Add Lemmatizer and simplify related components (#5848)

* Add Lemmatizer and simplify related components
* Add `Lemmatizer` pipe with `lookup` and `rule` modes using the `Lookups` tables.
* Reduce `Tagger` to a simple tagger that sets `Token.tag` (no pos or lemma)
* Reduce `Morphology` to only keep track of morph tags (no tag map, lemmatizer, or morph rules)
* Remove lemmatizer from `Vocab`
* Adjust many many tests

Differences:

* No default lookup lemmas
* No special treatment of TAG in `from_array` and similar required
* Easier to modify labels in a `Tagger`
* No extra strings added from morphology / tag map
* Fix test
* Initial fix for Lemmatizer config/serialization
* Adjust init test to be more generic
* Adjust init test to force empty Lookups
* Add simple cache to rule-based lemmatizer
* Convert language-specific lemmatizers
  Convert language-specific lemmatizers to component lemmatizers. Remove previous lemmatizer class.
* Fix French and Polish lemmatizers
* Remove outdated UPOS conversions
* Update Russian lemmatizer init in tests
* Add minimal init/run tests for custom lemmatizers
* Add option to overwrite existing lemmas
* Update mode setting, lookup loading, and caching
* Make `mode` an immutable property
* Only enforce strict `load_lookups` for known supported modes
* Move caching into individual `_lemmatize` methods
* Implement strict when lang is not found in lookups
* Fix tables/lookups in make_lemmatizer
* Reallow provided lookups and allow for stricter checks
* Add lookups asset to all Lemmatizer pipe tests
* Rename lookups in lemmatizer init test
* Clean up merge
* Refactor lookup table loading
* Add helper from `load_lemmatizer_lookups` that loads required and optional lookups tables based on settings provided by a config.

Additional slight refactor of lookups:

* Add `Lookups.set_table` to set a table from a provided `Table`
* Reorder class definitions to be able to specify type as `Table`
* Move registry assets into test methods
* Refactor lookups tables config
  Use class methods within `Lemmatizer` to provide the config for particular modes and to load the lookups from a config.
* Add pipe and score to lemmatizer
* Simplify Tagger.score
* Add missing import
* Clean up imports and auto-format
* Remove unused kwarg
* Tidy up and auto-format
* Update docstrings for Lemmatizer
  Update docstrings for Lemmatizer. Additionally modify `is_base_form` API to take `Token` instead of individual features.
* Update docstrings
* Remove tag map values from Tagger.add_label
* Update API docs
* Fix relative link in Lemmatizer API docs
2026-01-09 10:11:24 +03:00 · 2020-08-07 15:27:13 +02:00 · 2020-08-07 15:27:13 +02:00 · e962784531
commit e962784531
parent 1d01d89b79
59 changed files with 1439 additions and 1609 deletions
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -19,9 +19,6 @@ after_pipeline_creation = null
 [nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"

-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
 [components]

 # Training hyper-parameters and additional features.
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -510,7 +510,7 @@ class Errors:
    E952 = ("The section '{name}' is not a valid section in the provided config.")
    E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
    E954 = ("The Tok2Vec listener did not receive a valid input.")
-    E955 = ("Can't find table '{table}' for language '{lang}' in spacy-lookups-data.")
+    E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
    E956 = ("Can't find component '{name}' in [components] block in the config. "
            "Available components: {opts}")
    E957 = ("Writing directly to Language.factories isn't needed anymore in "
@@ -633,6 +633,11 @@ class Errors:
    E1001 = ("Target token outside of matched span for match with tokens "
             "'{span}' and offset '{index}' matched by patterns '{patterns}'.")
    E1002 = ("Span index out of range.")
+    E1003 = ("Unsupported lemmatizer mode '{mode}'.")
+    E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
+             "Required tables '{tables}', found '{found}'. If you are not "
+             "providing custom lookups, make sure you have the package "
+             "spacy-lookups-data installed.")


@add_codes
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -1,38 +1,17 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+from thinc.api import Model

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from .lemmatizer import GreekLemmatizer
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from ...lookups import load_lookups
+from .lemmatizer import GreekLemmatizer
+from ...lookups import Lookups
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.el.GreekLemmatizer"
-"""
-
-
-@registry.lemmatizers("spacy.el.GreekLemmatizer")
-def create_lemmatizer() -> Callable[[Language], GreekLemmatizer]:
-    tables = ["lemma_index", "lemma_exc", "lemma_rules"]
-
-    def lemmatizer_factory(nlp: Language) -> GreekLemmatizer:
-        lookups = load_lookups(lang=nlp.lang, tables=tables)
-        return GreekLemmatizer(lookups=lookups)
-
-    return lemmatizer_factory


 class GreekDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
@@ -47,4 +26,22 @@ class Greek(Language):
    Defaults = GreekDefaults


+@Greek.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Greek"]
--- a/spacy/lang/el/lemmatizer.py
+++ b/spacy/lang/el/lemmatizer.py
@@ -1,6 +1,7 @@
-from typing import Dict, List
+from typing import List

-from ...lemmatizer import Lemmatizer
+from ...pipeline import Lemmatizer
+from ...tokens import Token


 class GreekLemmatizer(Lemmatizer):
@@ -14,13 +15,27 @@ class GreekLemmatizer(Lemmatizer):
    not applicable for Greek language.
    """

-    def lemmatize(
-        self,
-        string: str,
-        index: Dict[str, List[str]],
-        exceptions: Dict[str, Dict[str, List[str]]],
-        rules: Dict[str, List[List[str]]],
-    ) -> List[str]:
+    def rule_lemmatize(self, token: Token) -> List[str]:
+        """Lemmatize using a rule-based approach.
+
+        token (Token): The token to lemmatize.
+        RETURNS (list): The available lemmas for the string.
+        """
+        cache_key = (token.lower, token.pos)
+        if cache_key in self.cache:
+            return self.cache[cache_key]
+        string = token.text
+        univ_pos = token.pos_.lower()
+        if univ_pos in ("", "eol", "space"):
+            return [string.lower()]
+
+        index_table = self.lookups.get_table("lemma_index", {})
+        exc_table = self.lookups.get_table("lemma_exc", {})
+        rules_table = self.lookups.get_table("lemma_rules", {})
+        index = index_table.get(univ_pos, {})
+        exceptions = exc_table.get(univ_pos, {})
+        rules = rules_table.get(univ_pos, {})
+
        string = string.lower()
        forms = []
        if string in index:
@@ -42,4 +57,6 @@ class GreekLemmatizer(Lemmatizer):
            forms.extend(oov_forms)
        if not forms:
            forms.append(string)
-        return list(set(forms))
+        forms = list(set(forms))
+        self.cache[cache_key] = forms
+        return forms
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -1,39 +1,18 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+
+from thinc.api import Model

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
-from .lemmatizer import is_base_form
 from .punctuation import TOKENIZER_INFIXES
+from .lemmatizer import EnglishLemmatizer
 from ...language import Language
-from ...lemmatizer import Lemmatizer
-from ...lookups import load_lookups
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.en.EnglishLemmatizer"
-"""
-
-
-@registry.lemmatizers("spacy.en.EnglishLemmatizer")
-def create_lemmatizer() -> Callable[[Language], Lemmatizer]:
-    tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
-
-    def lemmatizer_factory(nlp: Language) -> Lemmatizer:
-        lookups = load_lookups(lang=nlp.lang, tables=tables)
-        return Lemmatizer(lookups=lookups, is_base_form=is_base_form)
-
-    return lemmatizer_factory
+from ...lookups import Lookups


 class EnglishDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    infixes = TOKENIZER_INFIXES
    lex_attr_getters = LEX_ATTRS
@@ -46,4 +25,22 @@ class English(Language):
    Defaults = EnglishDefaults


+@English.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["English"]
--- a/spacy/lang/en/lemmatizer.py
+++ b/spacy/lang/en/lemmatizer.py
@@ -1,36 +1,43 @@
 from typing import Optional

+from ...pipeline import Lemmatizer
+from ...tokens import Token

-def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
-    """
-    Check whether we're dealing with an uninflected paradigm, so we can
-    avoid lemmatization entirely.

-    univ_pos (unicode / int): The token's universal part-of-speech tag.
-    morphology (dict): The token's morphological features following the
-        Universal Dependencies scheme.
+class EnglishLemmatizer(Lemmatizer):
+    """English lemmatizer. Only overrides is_base_form.
    """
-    if morphology is None:
-        morphology = {}
-    if univ_pos == "noun" and morphology.get("Number") == "sing":
-        return True
-    elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
-        return True
-    # This maps 'VBP' to base form -- probably just need 'IS_BASE'
-    # morphology
-    elif univ_pos == "verb" and (
-        morphology.get("VerbForm") == "fin"
-        and morphology.get("Tense") == "pres"
-        and morphology.get("Number") is None
-    ):
-        return True
-    elif univ_pos == "adj" and morphology.get("Degree") == "pos":
-        return True
-    elif morphology.get("VerbForm") == "inf":
-        return True
-    elif morphology.get("VerbForm") == "none":
-        return True
-    elif morphology.get("Degree") == "pos":
-        return True
-    else:
-        return False
+
+    def is_base_form(self, token: Token) -> bool:
+        """
+        Check whether we're dealing with an uninflected paradigm, so we can
+        avoid lemmatization entirely.
+
+        token (Token): The token whose coarse-grained part-of-speech tag and
+            morphological features (Universal Dependencies scheme) are checked.
+        RETURNS (bool): Whether the token is already in its base form.
+        """
+        univ_pos = token.pos_.lower()
+        morphology = token.morph.to_dict()
+        if univ_pos == "noun" and morphology.get("Number") == "Sing":
+            return True
+        elif univ_pos == "verb" and morphology.get("VerbForm") == "Inf":
+            return True
+        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
+        # morphology
+        elif univ_pos == "verb" and (
+            morphology.get("VerbForm") == "Fin"
+            and morphology.get("Tense") == "Pres"
+            and morphology.get("Number") is None
+        ):
+            return True
+        elif univ_pos == "adj" and morphology.get("Degree") == "Pos":
+            return True
+        elif morphology.get("VerbForm") == "Inf":
+            return True
+        elif morphology.get("VerbForm") == "None":
+            return True
+        elif morphology.get("Degree") == "Pos":
+            return True
+        else:
+            return False
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -1,5 +1,6 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+
+from thinc.api import Model

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
@@ -7,33 +8,12 @@ from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
-from .lemmatizer import FrenchLemmatizer, is_base_form
-from ...lookups import load_lookups
+from .lemmatizer import FrenchLemmatizer
+from ...lookups import Lookups
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.fr.FrenchLemmatizer"
-"""
-
-
-@registry.lemmatizers("spacy.fr.FrenchLemmatizer")
-def create_lemmatizer() -> Callable[[Language], FrenchLemmatizer]:
-    tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
-
-    def lemmatizer_factory(nlp: Language) -> FrenchLemmatizer:
-        lookups = load_lookups(lang=nlp.lang, tables=tables)
-        return FrenchLemmatizer(lookups=lookups, is_base_form=is_base_form)
-
-    return lemmatizer_factory


 class FrenchDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
@@ -49,4 +29,22 @@ class French(Language):
    Defaults = FrenchDefaults


+@French.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["French"]
--- a/spacy/lang/fr/lemmatizer.py
+++ b/spacy/lang/fr/lemmatizer.py
@@ -1,8 +1,7 @@
-from typing import Optional, List, Dict
+from typing import List, Dict

-from ...lemmatizer import Lemmatizer
-from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
-from ...symbols import SCONJ, CCONJ
+from ...pipeline import Lemmatizer
+from ...tokens import Token


 class FrenchLemmatizer(Lemmatizer):
@@ -15,65 +14,55 @@ class FrenchLemmatizer(Lemmatizer):
    the lookup table.
    """

-    def __call__(
-        self, string: str, univ_pos: str, morphology: Optional[dict] = None
-    ) -> List[str]:
-        lookup_table = self.lookups.get_table("lemma_lookup", {})
-        if "lemma_rules" not in self.lookups:
-            return [lookup_table.get(string, string)]
-        if univ_pos in (NOUN, "NOUN", "noun"):
-            univ_pos = "noun"
-        elif univ_pos in (VERB, "VERB", "verb"):
-            univ_pos = "verb"
-        elif univ_pos in (ADJ, "ADJ", "adj"):
-            univ_pos = "adj"
-        elif univ_pos in (ADP, "ADP", "adp"):
-            univ_pos = "adp"
-        elif univ_pos in (ADV, "ADV", "adv"):
-            univ_pos = "adv"
-        elif univ_pos in (AUX, "AUX", "aux"):
-            univ_pos = "aux"
-        elif univ_pos in (CCONJ, "CCONJ", "cconj"):
-            univ_pos = "cconj"
-        elif univ_pos in (DET, "DET", "det"):
-            univ_pos = "det"
-        elif univ_pos in (PRON, "PRON", "pron"):
-            univ_pos = "pron"
-        elif univ_pos in (PUNCT, "PUNCT", "punct"):
-            univ_pos = "punct"
-        elif univ_pos in (SCONJ, "SCONJ", "sconj"):
-            univ_pos = "sconj"
+    @classmethod
+    def get_lookups_config(cls, mode: str) -> Dict:
+        if mode == "rule":
+            return {
+                "required_tables": [
+                    "lemma_lookup",
+                    "lemma_rules",
+                    "lemma_exc",
+                    "lemma_index",
+                ],
+                "optional_tables": [],
+            }
        else:
-            return [self.lookup(string)]
+            return super().get_lookups_config(mode)
+
+    def rule_lemmatize(self, token: Token) -> List[str]:
+        cache_key = (token.orth, token.pos)
+        if cache_key in self.cache:
+            return self.cache[cache_key]
+        string = token.text
+        univ_pos = token.pos_.lower()
+        if univ_pos in ("", "eol", "space"):
+            return [string.lower()]
+        elif "lemma_rules" not in self.lookups or univ_pos not in (
+            "noun",
+            "verb",
+            "adj",
+            "adp",
+            "adv",
+            "aux",
+            "cconj",
+            "det",
+            "pron",
+            "punct",
+            "sconj",
+        ):
+            return self.lookup_lemmatize(token)
        index_table = self.lookups.get_table("lemma_index", {})
        exc_table = self.lookups.get_table("lemma_exc", {})
        rules_table = self.lookups.get_table("lemma_rules", {})
-        lemmas = self.lemmatize(
-            string,
-            index_table.get(univ_pos, {}),
-            exc_table.get(univ_pos, {}),
-            rules_table.get(univ_pos, []),
-        )
-        return lemmas
-
-    def lookup(self, string: str, orth: Optional[int] = None) -> str:
-        lookup_table = self.lookups.get_table("lemma_lookup", {})
-        if orth is not None and orth in lookup_table:
-            return lookup_table[orth][0]
-        return string
-
-    def lemmatize(
-        self,
-        string: str,
-        index: Dict[str, List[str]],
-        exceptions: Dict[str, Dict[str, List[str]]],
-        rules: Dict[str, List[List[str]]],
-    ) -> List[str]:
        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        index = index_table.get(univ_pos, {})
+        exceptions = exc_table.get(univ_pos, {})
+        rules = rules_table.get(univ_pos, [])
        string = string.lower()
        forms = []
        if string in index:
            forms.append(string)
+            self.cache[cache_key] = forms
            return forms
        forms.extend(exceptions.get(string, []))
        oov_forms = []
@@ -90,45 +79,9 @@ class FrenchLemmatizer(Lemmatizer):
        if not forms:
            forms.extend(oov_forms)
        if not forms and string in lookup_table.keys():
-            forms.append(lookup_table[string][0])
+            forms.append(self.lookup_lemmatize(token)[0])
        if not forms:
            forms.append(string)
-        return list(set(forms))
-
-
-def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
-    """
-    Check whether we're dealing with an uninflected paradigm, so we can
-    avoid lemmatization entirely.
-    """
-    morphology = {} if morphology is None else morphology
-    others = [
-        key
-        for key in morphology
-        if key not in (POS, "Number", "POS", "VerbForm", "Tense")
-    ]
-    if univ_pos == "noun" and morphology.get("Number") == "sing":
-        return True
-    elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
-        return True
-    # This maps 'VBP' to base form -- probably just need 'IS_BASE'
-    # morphology
-    elif univ_pos == "verb" and (
-        morphology.get("VerbForm") == "fin"
-        and morphology.get("Tense") == "pres"
-        and morphology.get("Number") is None
-        and not others
-    ):
-        return True
-    elif univ_pos == "adj" and morphology.get("Degree") == "pos":
-        return True
-    elif "VerbForm=inf" in morphology:
-        return True
-    elif "VerbForm=none" in morphology:
-        return True
-    elif "Number=sing" in morphology:
-        return True
-    elif "Degree=pos" in morphology:
-        return True
-    else:
-        return False
+        forms = list(set(forms))
+        self.cache[cache_key] = forms
+        return forms
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -38,8 +38,6 @@ def create_tokenizer(split_mode: Optional[str] = None):
 class JapaneseTokenizer(DummyTokenizer):
    def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
        self.vocab = nlp.vocab
-        # TODO: is this the right way to do it?
-        self.vocab.morphology.load_tag_map(TAG_MAP)
        self.split_mode = split_mode
        self.tokenizer = try_sudachi_import(self.split_mode)

--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -7,6 +7,7 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...compat import copy_reg
+from ...symbols import POS
 from ...util import DummyTokenizer, registry


@@ -29,8 +30,6 @@ def create_tokenizer():
 class KoreanTokenizer(DummyTokenizer):
    def __init__(self, nlp: Optional[Language] = None):
        self.vocab = nlp.vocab
-        # TODO: is this the right way to do it?
-        self.vocab.morphology.load_tag_map(TAG_MAP)
        MeCab = try_mecab_import()
        self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")

@@ -44,6 +43,7 @@ class KoreanTokenizer(DummyTokenizer):
        for token, dtoken in zip(doc, dtokens):
            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
            token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
+            token.pos = TAG_MAP[token.tag_][POS]
            token.lemma_ = dtoken["lemma"]
        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
        return doc
--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@@ -1,5 +1,6 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+
+from thinc.api import Model

 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -7,32 +8,11 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
-from ...lookups import load_lookups
+from ...lookups import Lookups
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.nl.DutchLemmatizer"
-"""
-
-
-@registry.lemmatizers("spacy.nl.DutchLemmatizer")
-def create_lemmatizer() -> Callable[[Language], DutchLemmatizer]:
-    tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
-
-    def lemmatizer_factory(nlp: Language) -> DutchLemmatizer:
-        lookups = load_lookups(lang=nlp.lang, tables=tables)
-        return DutchLemmatizer(lookups=lookups)
-
-    return lemmatizer_factory


 class DutchDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
@@ -46,4 +26,22 @@ class Dutch(Language):
    Defaults = DutchDefaults


+@Dutch.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Dutch"]
--- a/spacy/lang/nl/lemmatizer.py
+++ b/spacy/lang/nl/lemmatizer.py
@@ -1,44 +1,34 @@
-from typing import Optional, List, Dict, Tuple
+from typing import List, Dict

-from ...lemmatizer import Lemmatizer
-from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
+from ...pipeline import Lemmatizer
+from ...tokens import Token


 class DutchLemmatizer(Lemmatizer):
-    # Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
-    univ_pos_name_variants = {
-        NOUN: "noun",
-        "NOUN": "noun",
-        "noun": "noun",
-        VERB: "verb",
-        "VERB": "verb",
-        "verb": "verb",
-        AUX: "verb",
-        "AUX": "verb",
-        "aux": "verb",
-        ADJ: "adj",
-        "ADJ": "adj",
-        "adj": "adj",
-        ADV: "adv",
-        "ADV": "adv",
-        "adv": "adv",
-        PRON: "pron",
-        "PRON": "pron",
-        "pron": "pron",
-        DET: "det",
-        "DET": "det",
-        "det": "det",
-        ADP: "adp",
-        "ADP": "adp",
-        "adp": "adp",
-        NUM: "num",
-        "NUM": "num",
-        "num": "num",
-    }
+    @classmethod
+    def get_lookups_config(cls, mode: str) -> Dict:
+        if mode == "rule":
+            return {
+                "required_tables": [
+                    "lemma_lookup",
+                    "lemma_rules",
+                    "lemma_exc",
+                    "lemma_index",
+                ],
+            }
+        else:
+            return super().get_lookups_config(mode)

-    def __call__(
-        self, string: str, univ_pos: str, morphology: Optional[dict] = None
-    ) -> List[str]:
+    def lookup_lemmatize(self, token: Token) -> List[str]:
+        """Overrides parent method so that a lowercased version of the string
+        is used to search the lookup table. This is necessary because our
+        lookup table consists entirely of lowercase keys."""
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        string = token.text.lower()
+        return [lookup_table.get(string, string)]
+
+    # Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
+    def rule_lemmatize(self, token: Token) -> List[str]:
        # Difference 1: self.rules is assumed to be non-None, so no
        # 'is None' check required.
        # String lowercased from the get-go. All lemmatization results in
@@ -46,74 +36,61 @@ class DutchLemmatizer(Lemmatizer):
        # any problems, and it keeps the exceptions indexes small. If this
        # creates problems for proper nouns, we can introduce a check for
        # univ_pos == "PROPN".
-        string = string.lower()
-        try:
-            univ_pos = self.univ_pos_name_variants[univ_pos]
-        except KeyError:
-            # Because PROPN not in self.univ_pos_name_variants, proper names
-            # are not lemmatized. They are lowercased, however.
-            return [string]
-            # if string in self.lemma_index.get(univ_pos)
+        cache_key = (token.lower, token.pos)
+        if cache_key in self.cache:
+            return self.cache[cache_key]
+        string = token.text
+        univ_pos = token.pos_.lower()
+        if univ_pos in ("", "eol", "space"):
+            forms = [string.lower()]
+            self.cache[cache_key] = forms
+            return forms
+
        index_table = self.lookups.get_table("lemma_index", {})
+        exc_table = self.lookups.get_table("lemma_exc", {})
+        rules_table = self.lookups.get_table("lemma_rules", {})
+        index = index_table.get(univ_pos, {})
+        exceptions = exc_table.get(univ_pos, {})
+        rules = rules_table.get(univ_pos, {})
+
+        string = string.lower()
+        if univ_pos not in (
+            "noun",
+            "verb",
+            "aux",
+            "adj",
+            "adv",
+            "pron",
+            "det",
+            "adp",
+            "num",
+        ):
+            forms = [string]
+            self.cache[cache_key] = forms
+            return forms
        lemma_index = index_table.get(univ_pos, {})
        # string is already lemma
        if string in lemma_index:
-            return [string]
+            forms = [string]
+            self.cache[cache_key] = forms
+            return forms
        exc_table = self.lookups.get_table("lemma_exc", {})
        exceptions = exc_table.get(univ_pos, {})
        # string is irregular token contained in exceptions index.
        try:
-            lemma = exceptions[string]
-            return [lemma[0]]
+            forms = [exceptions[string][0]]
+            self.cache[cache_key] = forms
+            return forms
        except KeyError:
            pass
        # string corresponds to key in lookup table
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        looked_up_lemma = lookup_table.get(string)
        if looked_up_lemma and looked_up_lemma in lemma_index:
-            return [looked_up_lemma]
+            forms = [looked_up_lemma]
+            self.cache[cache_key] = forms
+            return forms
        rules_table = self.lookups.get_table("lemma_rules", {})
-        forms, is_known = self.lemmatize(
-            string, lemma_index, exceptions, rules_table.get(univ_pos, [])
-        )
-        # Back-off through remaining return value candidates.
-        if forms:
-            if is_known:
-                return forms
-            else:
-                for form in forms:
-                    if form in exceptions:
-                        return [form]
-            if looked_up_lemma:
-                return [looked_up_lemma]
-            else:
-                return forms
-        elif looked_up_lemma:
-            return [looked_up_lemma]
-        else:
-            return [string]
-
-    # Overrides parent method so that a lowercased version of the string is
-    # used to search the lookup table. This is necessary because our lookup
-    # table consists entirely of lowercase keys.
-    def lookup(self, string: str, orth: Optional[int] = None) -> str:
-        lookup_table = self.lookups.get_table("lemma_lookup", {})
-        string = string.lower()
-        if orth is not None:
-            return lookup_table.get(orth, string)
-        else:
-            return lookup_table.get(string, string)
-
-    # Reimplemented to focus more on application of suffix rules and to return
-    # as early as possible.
-    def lemmatize(
-        self,
-        string: str,
-        index: Dict[str, List[str]],
-        exceptions: Dict[str, Dict[str, List[str]]],
-        rules: Dict[str, List[List[str]]],
-    ) -> Tuple[List[str], bool]:
-        # returns (forms, is_known: bool)
        oov_forms = []
        for old, new in rules:
            if string.endswith(old):
@@ -121,7 +98,31 @@ class DutchLemmatizer(Lemmatizer):
                if not form:
                    pass
                elif form in index:
-                    return [form], True  # True = Is known (is lemma)
+                    forms = [form]
+                    self.cache[cache_key] = forms
+                    return forms
                else:
                    oov_forms.append(form)
-        return list(set(oov_forms)), False
+        forms = list(set(oov_forms))
+        # Back-off through remaining return value candidates.
+        if forms:
+            for form in forms:
+                if form in exceptions:
+                    forms = [form]
+                    self.cache[cache_key] = forms
+                    return forms
+            if looked_up_lemma:
+                forms = [looked_up_lemma]
+                self.cache[cache_key] = forms
+                return forms
+            else:
+                self.cache[cache_key] = forms
+                return forms
+        elif looked_up_lemma:
+            forms = [looked_up_lemma]
+            self.cache[cache_key] = forms
+            return forms
+        else:
+            forms = [string]
+            self.cache[cache_key] = forms
+            return forms
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -1,5 +1,6 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+
+from thinc.api import Model

 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
@ -7,42 +8,16 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import PolishLemmatizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...lookups import load_lookups
+from ...lookups import Lookups
 from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.pl.PolishLemmatizer"
-"""
-
 TOKENIZER_EXCEPTIONS = {
    exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
 }


-@registry.lemmatizers("spacy.pl.PolishLemmatizer")
-def create_lemmatizer() -> Callable[[Language], PolishLemmatizer]:
-    # fmt: off
-    tables = [
-        "lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv",
-        "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num",
-        "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"
-    ]
-    # fmt: on
-
-    def lemmatizer_factory(nlp: Language) -> PolishLemmatizer:
-        lookups = load_lookups(lang=nlp.lang, tables=tables)
-        return PolishLemmatizer(lookups=lookups)
-
-    return lemmatizer_factory
-
-
 class PolishDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
@ -56,4 +31,22 @@ class Polish(Language):
    Defaults = PolishDefaults


+@Polish.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "lookup", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    """Factory registered on Polish under the name "lemmatizer".
+
+    nlp (Language): The pipeline object; its `lang` and `vocab` are used.
+    model (Optional[Model]): Passed through to PolishLemmatizer (the default
+        config sets it to None).
+    name (str): The component instance name, passed through.
+    mode (str): The lemmatizer mode (the default config sets it to "lookup").
+    lookups (Optional[Lookups]): Lookups handed to
+        PolishLemmatizer.load_lookups together with `nlp.lang` and `mode`;
+        the result of that call is what the component receives.
+    RETURNS (PolishLemmatizer): The constructed component.
+    """
+    # NOTE(review): load_lookups is defined outside this hunk; presumably it
+    # loads the tables required for `mode` when `lookups` is None - confirm.
+    lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Polish"]
--- a/spacy/lang/pl/lemmatizer.py
+++ b/spacy/lang/pl/lemmatizer.py
@ -1,7 +1,7 @@
-from typing import Optional, List, Dict
+from typing import List, Dict

-from ...lemmatizer import Lemmatizer
-from ...parts_of_speech import NAMES
+from ...pipeline import Lemmatizer
+from ...tokens import Token


 class PolishLemmatizer(Lemmatizer):
@ -9,12 +9,30 @@ class PolishLemmatizer(Lemmatizer):
    # dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS.
    # It utilizes some prefix based improvements for verb and adjectives
    # lemmatization, as well as case-sensitive lemmatization for nouns.
-    def __call__(
-        self, string: str, univ_pos: str, morphology: Optional[dict] = None
-    ) -> List[str]:
-        if isinstance(univ_pos, int):
-            univ_pos = NAMES.get(univ_pos, "X")
-        univ_pos = univ_pos.upper()
+
+    @classmethod
+    def get_lookups_config(cls, mode: str) -> Dict:
+        """Return the lookups config for the given lemmatizer mode.
+
+        mode (str): The lemmatizer mode.
+        RETURNS (Dict): For "lookup", a dict whose "required_tables" entry
+            lists the nine per-POS "lemma_lookup_*" tables; for any other
+            mode, whatever the parent class returns for that mode.
+        """
+        if mode == "lookup":
+            return {
+                "required_tables": [
+                    "lemma_lookup_adj",
+                    "lemma_lookup_adp",
+                    "lemma_lookup_adv",
+                    "lemma_lookup_aux",
+                    "lemma_lookup_noun",
+                    "lemma_lookup_num",
+                    "lemma_lookup_part",
+                    "lemma_lookup_pron",
+                    "lemma_lookup_verb",
+                ]
+            }
+        else:
+            # Modes other than "lookup" are not customised for Polish.
+            return super().get_lookups_config(mode)
+
+    def lookup_lemmatize(self, token: Token) -> List[str]:
+        string = token.text
+        univ_pos = token.pos_
+        morphology = token.morph.to_dict()
        lookup_pos = univ_pos.lower()
        if univ_pos == "PROPN":
            lookup_pos = "noun"
@ -71,15 +89,3 @@ class PolishLemmatizer(Lemmatizer):
                return [lookup_table[string]]
            return [string.lower()]
        return [lookup_table.get(string, string)]
-
-    def lookup(self, string: str, orth: Optional[int] = None) -> str:
-        return string.lower()
-
-    def lemmatize(
-        self,
-        string: str,
-        index: Dict[str, List[str]],
-        exceptions: Dict[str, Dict[str, List[str]]],
-        rules: Dict[str, List[List[str]]],
-    ) -> List[str]:
-        raise NotImplementedError
--- a/spacy/lang/ru/init.py
+++ b/spacy/lang/ru/init.py
@ -1,32 +1,16 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+
+from thinc.api import Model

 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
-from ...util import registry
 from ...language import Language
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.ru.RussianLemmatizer"
-"""
-
-
-@registry.lemmatizers("spacy.ru.RussianLemmatizer")
-def create_lemmatizer() -> Callable[[Language], RussianLemmatizer]:
-    def lemmatizer_factory(nlp: Language) -> RussianLemmatizer:
-        return RussianLemmatizer()
-
-    return lemmatizer_factory
+from ...lookups import Lookups


 class RussianDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
@ -37,4 +21,21 @@ class Russian(Language):
    Defaults = RussianDefaults


+@Russian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    """Factory registered on Russian under the name "lemmatizer".
+
+    nlp (Language): The pipeline object; only its `vocab` is used here.
+    model (Optional[Model]): Passed through to RussianLemmatizer (the default
+        config sets it to None).
+    name (str): The component instance name, passed through.
+    mode (str): The lemmatizer mode (the default config sets it to
+        "pymorphy2").
+    lookups (Optional[Lookups]): Passed through unchanged; no tables are
+        loaded by this factory.
+    RETURNS (RussianLemmatizer): The constructed component.
+    """
+    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Russian"]
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@ -1,8 +1,12 @@
-from typing import Optional, Tuple, Dict, List
+from typing import Optional, List, Dict, Tuple
+
+from thinc.api import Model

-from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
-from ...lemmatizer import Lemmatizer
 from ...lookups import Lookups
+from ...pipeline import Lemmatizer
+from ...symbols import POS
+from ...tokens import Token
+from ...vocab import Vocab


 PUNCT_RULES = {"«": '"', "»": '"'}
@ -11,8 +15,17 @@ PUNCT_RULES = {"«": '"', "»": '"'}
 class RussianLemmatizer(Lemmatizer):
    _morph = None

-    def __init__(self, lookups: Optional[Lookups] = None) -> None:
-        super(RussianLemmatizer, self).__init__(lookups)
+    def __init__(
+        self,
+        vocab: Vocab,
+        model: Optional[Model],
+        name: str = "lemmatizer",
+        *,
+        mode: str = "pymorphy2",
+        lookups: Optional[Lookups] = None,
+    ) -> None:
+        super().__init__(vocab, model, name, mode=mode, lookups=lookups)
+
        try:
            from pymorphy2 import MorphAnalyzer
        except ImportError:
@ -25,10 +38,10 @@ class RussianLemmatizer(Lemmatizer):
        if RussianLemmatizer._morph is None:
            RussianLemmatizer._morph = MorphAnalyzer()

-    def __call__(
-        self, string: str, univ_pos: str, morphology: Optional[dict] = None
-    ) -> List[str]:
-        univ_pos = self.normalize_univ_pos(univ_pos)
+    def pymorphy2_lemmatize(self, token: Token) -> List[str]:
+        string = token.text
+        univ_pos = token.pos_
+        morphology = token.morph.to_dict()
        if univ_pos == "PUNCT":
            return [PUNCT_RULES.get(string, string)]
        if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
@ -81,25 +94,8 @@ class RussianLemmatizer(Lemmatizer):
            return [string.lower()]
        return list(set([analysis.normal_form for analysis in filtered_analyses]))

-    @staticmethod
-    def normalize_univ_pos(univ_pos: str) -> Optional[str]:
-        if isinstance(univ_pos, str):
-            return univ_pos.upper()
-        symbols_to_str = {
-            ADJ: "ADJ",
-            DET: "DET",
-            NOUN: "NOUN",
-            NUM: "NUM",
-            PRON: "PRON",
-            PROPN: "PROPN",
-            PUNCT: "PUNCT",
-            VERB: "VERB",
-        }
-        if univ_pos in symbols_to_str:
-            return symbols_to_str[univ_pos]
-        return None
-
-    def lookup(self, string: str, orth: Optional[int] = None) -> str:
+    def lookup_lemmatize(self, token: Token) -> List[str]:
+        string = token.text
        analyses = self._morph.parse(string)
        if len(analyses) == 1:
            return analyses[0].normal_form
--- a/spacy/lang/uk/init.py
+++ b/spacy/lang/uk/init.py
@ -1,32 +1,16 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+
+from thinc.api import Model

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ...util import registry
-from ...language import Language
 from .lemmatizer import UkrainianLemmatizer
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.uk.UkrainianLemmatizer"
-"""
-
-
-@registry.lemmatizers("spacy.uk.UkrainianLemmatizer")
-def create_ukrainian_lemmatizer() -> Callable[[Language], UkrainianLemmatizer]:
-    def lemmatizer_factory(nlp: Language) -> UkrainianLemmatizer:
-        return UkrainianLemmatizer()
-
-    return lemmatizer_factory
+from ...language import Language
+from ...lookups import Lookups


 class UkrainianDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
@ -37,4 +21,21 @@ class Ukrainian(Language):
    Defaults = UkrainianDefaults


+@Ukrainian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    """Factory registered on Ukrainian under the name "lemmatizer".
+
+    nlp (Language): The pipeline object; only its `vocab` is used here.
+    model (Optional[Model]): Passed through to UkrainianLemmatizer (the
+        default config sets it to None).
+    name (str): The component instance name, passed through.
+    mode (str): The lemmatizer mode (the default config sets it to
+        "pymorphy2").
+    lookups (Optional[Lookups]): Passed through unchanged; no tables are
+        loaded by this factory.
+    RETURNS (UkrainianLemmatizer): The constructed component.
+    """
+    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Ukrainian"]
--- a/spacy/lang/uk/lemmatizer.py
+++ b/spacy/lang/uk/lemmatizer.py
@ -1,187 +1,30 @@
-from typing import Optional, List, Tuple, Dict
+from typing import Optional

-from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
+from thinc.api import Model
+
+from ..ru.lemmatizer import RussianLemmatizer
 from ...lookups import Lookups
-from ...lemmatizer import Lemmatizer
+from ...vocab import Vocab


-PUNCT_RULES = {"«": '"', "»": '"'}
-
-
-class UkrainianLemmatizer(Lemmatizer):
-    _morph = None
-
-    def __init__(self, lookups: Optional[Lookups] = None) -> None:
-        super(UkrainianLemmatizer, self).__init__(lookups)
+class UkrainianLemmatizer(RussianLemmatizer):
+    """Lemmatizer for Ukrainian. Inherits every lemmatization method from
+    RussianLemmatizer and only swaps in a pymorphy2 analyzer created with
+    lang="uk".
+    """
+
+    # This class needs its own analyzer slot. Without it, the
+    # `UkrainianLemmatizer._morph` lookup in __init__ resolves to the inherited
+    # RussianLemmatizer._morph, which the parent __init__ has already filled
+    # with the Russian analyzer, so the Ukrainian analyzer would never be
+    # created.
+    _morph = None
+
+    def __init__(
+        self,
+        vocab: Vocab,
+        model: Optional[Model],
+        name: str = "lemmatizer",
+        *,
+        mode: str = "pymorphy2",
+        lookups: Optional[Lookups] = None,
+    ) -> None:
+        """Initialize the lemmatizer and, on first use, the shared Ukrainian
+        pymorphy2 analyzer stored on the class.
+
+        vocab (Vocab): Passed through to RussianLemmatizer.
+        model (Optional[Model]): Passed through to RussianLemmatizer.
+        name (str): The component instance name.
+        mode (str): The lemmatizer mode. Defaults to "pymorphy2".
+        lookups (Optional[Lookups]): Passed through to RussianLemmatizer.
+        RAISES (ImportError): If pymorphy2 cannot be imported.
+        """
+        super().__init__(vocab, model, name, mode=mode, lookups=lookups)
        try:
            from pymorphy2 import MorphAnalyzer
-
-            if UkrainianLemmatizer._morph is None:
-                UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
-        except (ImportError, TypeError):
+        except ImportError:
            raise ImportError(
                "The Ukrainian lemmatizer requires the pymorphy2 library and "
                'dictionaries: try to fix it with "pip uninstall pymorphy2" and'
                '"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
            ) from None
-
-    def __call__(
-        self, string: str, univ_pos: str, morphology: Optional[dict] = None
-    ) -> List[str]:
-        univ_pos = self.normalize_univ_pos(univ_pos)
-        if univ_pos == "PUNCT":
-            return [PUNCT_RULES.get(string, string)]
-        if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
-            # Skip unchangeable pos
-            return [string.lower()]
-        analyses = self._morph.parse(string)
-        filtered_analyses = []
-        for analysis in analyses:
-            if not analysis.is_known:
-                # Skip suggested parse variant for unknown word for pymorphy
-                continue
-            analysis_pos, _ = oc2ud(str(analysis.tag))
-            if analysis_pos == univ_pos or (
-                analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
-            ):
-                filtered_analyses.append(analysis)
-        if not len(filtered_analyses):
-            return [string.lower()]
-        if morphology is None or (len(morphology) == 1 and POS in morphology):
-            return list(set([analysis.normal_form for analysis in filtered_analyses]))
-        if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
-            features_to_compare = ["Case", "Number", "Gender"]
-        elif univ_pos == "NUM":
-            features_to_compare = ["Case", "Gender"]
-        elif univ_pos == "PRON":
-            features_to_compare = ["Case", "Number", "Gender", "Person"]
-        else:  # VERB
-            features_to_compare = [
-                "Aspect",
-                "Gender",
-                "Mood",
-                "Number",
-                "Tense",
-                "VerbForm",
-                "Voice",
-            ]
-        analyses, filtered_analyses = filtered_analyses, []
-        for analysis in analyses:
-            _, analysis_morph = oc2ud(str(analysis.tag))
-            for feature in features_to_compare:
-                if (
-                    feature in morphology
-                    and feature in analysis_morph
-                    and morphology[feature].lower() != analysis_morph[feature].lower()
-                ):
-                    break
-            else:
-                filtered_analyses.append(analysis)
-        if not len(filtered_analyses):
-            return [string.lower()]
-        return list(set([analysis.normal_form for analysis in filtered_analyses]))
-
-    @staticmethod
-    def normalize_univ_pos(univ_pos: str) -> Optional[str]:
-        if isinstance(univ_pos, str):
-            return univ_pos.upper()
-        symbols_to_str = {
-            ADJ: "ADJ",
-            DET: "DET",
-            NOUN: "NOUN",
-            NUM: "NUM",
-            PRON: "PRON",
-            PROPN: "PROPN",
-            PUNCT: "PUNCT",
-            VERB: "VERB",
-        }
-        if univ_pos in symbols_to_str:
-            return symbols_to_str[univ_pos]
-        return None
-
-    def lookup(self, string: str, orth: Optional[int] = None) -> str:
-        analyses = self._morph.parse(string)
-        if len(analyses) == 1:
-            return analyses[0].normal_form
-        return string
-
-
-def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
-    gram_map = {
-        "_POS": {
-            "ADJF": "ADJ",
-            "ADJS": "ADJ",
-            "ADVB": "ADV",
-            "Apro": "DET",
-            "COMP": "ADJ",  # Can also be an ADV - unchangeable
-            "CONJ": "CCONJ",  # Can also be a SCONJ - both unchangeable ones
-            "GRND": "VERB",
-            "INFN": "VERB",
-            "INTJ": "INTJ",
-            "NOUN": "NOUN",
-            "NPRO": "PRON",
-            "NUMR": "NUM",
-            "NUMB": "NUM",
-            "PNCT": "PUNCT",
-            "PRCL": "PART",
-            "PREP": "ADP",
-            "PRTF": "VERB",
-            "PRTS": "VERB",
-            "VERB": "VERB",
-        },
-        "Animacy": {"anim": "Anim", "inan": "Inan"},
-        "Aspect": {"impf": "Imp", "perf": "Perf"},
-        "Case": {
-            "ablt": "Ins",
-            "accs": "Acc",
-            "datv": "Dat",
-            "gen1": "Gen",
-            "gen2": "Gen",
-            "gent": "Gen",
-            "loc2": "Loc",
-            "loct": "Loc",
-            "nomn": "Nom",
-            "voct": "Voc",
-        },
-        "Degree": {"COMP": "Cmp", "Supr": "Sup"},
-        "Gender": {"femn": "Fem", "masc": "Masc", "neut": "Neut"},
-        "Mood": {"impr": "Imp", "indc": "Ind"},
-        "Number": {"plur": "Plur", "sing": "Sing"},
-        "NumForm": {"NUMB": "Digit"},
-        "Person": {"1per": "1", "2per": "2", "3per": "3", "excl": "2", "incl": "1"},
-        "Tense": {"futr": "Fut", "past": "Past", "pres": "Pres"},
-        "Variant": {"ADJS": "Brev", "PRTS": "Brev"},
-        "VerbForm": {
-            "GRND": "Conv",
-            "INFN": "Inf",
-            "PRTF": "Part",
-            "PRTS": "Part",
-            "VERB": "Fin",
-        },
-        "Voice": {"actv": "Act", "pssv": "Pass"},
-        "Abbr": {"Abbr": "Yes"},
-    }
-    pos = "X"
-    morphology = dict()
-    unmatched = set()
-    grams = oc_tag.replace(" ", ",").split(",")
-    for gram in grams:
-        match = False
-        for categ, gmap in sorted(gram_map.items()):
-            if gram in gmap:
-                match = True
-                if categ == "_POS":
-                    pos = gmap[gram]
-                else:
-                    morphology[categ] = gmap[gram]
-        if not match:
-            unmatched.add(gram)
-    while len(unmatched) > 0:
-        gram = unmatched.pop()
-        if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
-            pos = "PROPN"
-        elif gram == "Auxt":
-            pos = "AUX"
-        elif gram == "Pltm":
-            morphology["Number"] = "Ptan"
-    return pos, morphology
+        # NOTE(review): before this change a TypeError raised by
+        # MorphAnalyzer(lang="uk") was also converted into the ImportError
+        # above; it now propagates as-is - confirm that this is intended.
+        if UkrainianLemmatizer._morph is None:
+            UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
--- a/spacy/language.py
+++ b/spacy/language.py
@ -29,7 +29,6 @@ from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
 from .lookups import load_lookups
 from .tokenizer import Tokenizer
-from .lemmatizer import Lemmatizer
 from .errors import Errors, Warnings
 from .schemas import ConfigSchema
 from .git_info import GIT_VERSION
@ -87,22 +86,6 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
    return tokenizer_factory


-@registry.lemmatizers("spacy.Lemmatizer.v1")
-def create_lemmatizer() -> Callable[["Language"], "Lemmatizer"]:
-    """Registered function to create a lemmatizer. Returns a factory that takes
-    the nlp object and returns a Lemmatizer instance with data loaded in from
-    spacy-lookups-data, if the package is installed.
-    """
-    # TODO: Will be replaced when the lemmatizer becomes a pipeline component
-    tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
-
-    def lemmatizer_factory(nlp: "Language") -> "Lemmatizer":
-        lookups = load_lookups(lang=nlp.lang, tables=tables, strict=False)
-        return Lemmatizer(lookups=lookups)
-
-    return lemmatizer_factory
-
-
 class Language:
    """A text-processing pipeline. Usually you'll load this once per process,
    and pass the instance around your application.
@ -128,7 +111,6 @@ class Language:
        max_length: int = 10 ** 6,
        meta: Dict[str, Any] = {},
        create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
-        create_lemmatizer: Optional[Callable[["Language"], Callable]] = None,
        **kwargs,
    ) -> None:
        """Initialise a Language object.
@ -146,8 +128,6 @@ class Language:
            100,000 characters in one text.
        create_tokenizer (Callable): Function that takes the nlp object and
            returns a tokenizer.
-        create_lemmatizer (Callable): Function that takes the nlp object and
-            returns a lemmatizer.

        DOCS: https://spacy.io/api/language#init
        """
@ -166,13 +146,9 @@ class Language:

        if vocab is True:
            vectors_name = meta.get("vectors", {}).get("name")
-            if not create_lemmatizer:
-                lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]}
-                create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
            vocab = create_vocab(
                self.lang,
                self.Defaults,
-                lemmatizer=create_lemmatizer(self),
                vectors_name=vectors_name,
                load_data=self._config["nlp"]["load_vocab_data"],
            )
@ -1451,7 +1427,6 @@ class Language:
        filled["components"] = orig_pipeline
        config["components"] = orig_pipeline
        create_tokenizer = resolved["nlp"]["tokenizer"]
-        create_lemmatizer = resolved["nlp"]["lemmatizer"]
        before_creation = resolved["nlp"]["before_creation"]
        after_creation = resolved["nlp"]["after_creation"]
        after_pipeline_creation = resolved["nlp"]["after_pipeline_creation"]
@ -1467,7 +1442,6 @@ class Language:
        nlp = lang_cls(
            vocab=vocab,
            create_tokenizer=create_tokenizer,
-            create_lemmatizer=create_lemmatizer,
        )
        if after_creation is not None:
            nlp = after_creation(nlp)
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@ -1,145 +0,0 @@
-from typing import Optional, Callable, List, Dict
-
-from .lookups import Lookups
-from .parts_of_speech import NAMES as UPOS_NAMES
-
-
-class Lemmatizer:
-    """
-    The Lemmatizer supports simple part-of-speech-sensitive suffix rules and
-    lookup tables.
-
-    DOCS: https://spacy.io/api/lemmatizer
-    """
-
-    def __init__(
-        self,
-        lookups: Optional[Lookups] = None,
-        is_base_form: Optional[Callable] = None,
-    ) -> None:
-        """Initialize a Lemmatizer.
-
-        lookups (Lookups): The lookups object containing the (optional) tables
-            "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
-        """
-        self.lookups = lookups if lookups is not None else Lookups()
-        self.is_base_form = is_base_form
-
-    def __call__(
-        self, string: str, univ_pos: str, morphology: Optional[dict] = None
-    ) -> List[str]:
-        """Lemmatize a string.
-
-        string (str): The string to lemmatize, e.g. the token text.
-        univ_pos (str / int): The token's universal part-of-speech tag.
-        morphology (dict): The token's morphological features following the
-            Universal Dependencies scheme.
-        RETURNS (list): The available lemmas for the string.
-        """
-        lookup_table = self.lookups.get_table("lemma_lookup", {})
-        if "lemma_rules" not in self.lookups:
-            return [lookup_table.get(string, string)]
-        if isinstance(univ_pos, int):
-            univ_pos = UPOS_NAMES.get(univ_pos, "X")
-        univ_pos = univ_pos.lower()
-        if univ_pos in ("", "eol", "space"):
-            return [string.lower()]
-        # See Issue #435 for example of where this logic is requied.
-        if callable(self.is_base_form) and self.is_base_form(univ_pos, morphology):
-            return [string.lower()]
-        index_table = self.lookups.get_table("lemma_index", {})
-        exc_table = self.lookups.get_table("lemma_exc", {})
-        rules_table = self.lookups.get_table("lemma_rules", {})
-        if not any(
-            (
-                index_table.get(univ_pos),
-                exc_table.get(univ_pos),
-                rules_table.get(univ_pos),
-            )
-        ):
-            if univ_pos == "propn":
-                return [string]
-            else:
-                return [string.lower()]
-        lemmas = self.lemmatize(
-            string,
-            index_table.get(univ_pos, {}),
-            exc_table.get(univ_pos, {}),
-            rules_table.get(univ_pos, []),
-        )
-        return lemmas
-
-    def noun(self, string: str, morphology: Optional[dict] = None) -> List[str]:
-        return self(string, "noun", morphology)
-
-    def verb(self, string: str, morphology: Optional[dict] = None) -> List[str]:
-        return self(string, "verb", morphology)
-
-    def adj(self, string: str, morphology: Optional[dict] = None) -> List[str]:
-        return self(string, "adj", morphology)
-
-    def det(self, string: str, morphology: Optional[dict] = None) -> List[str]:
-        return self(string, "det", morphology)
-
-    def pron(self, string: str, morphology: Optional[dict] = None) -> List[str]:
-        return self(string, "pron", morphology)
-
-    def adp(self, string: str, morphology: Optional[dict] = None) -> List[str]:
-        return self(string, "adp", morphology)
-
-    def num(self, string: str, morphology: Optional[dict] = None) -> List[str]:
-        return self(string, "num", morphology)
-
-    def punct(self, string: str, morphology: Optional[dict] = None) -> List[str]:
-        return self(string, "punct", morphology)
-
-    def lookup(self, string: str, orth: Optional[int] = None) -> str:
-        """Look up a lemma in the table, if available. If no lemma is found,
-        the original string is returned.
-
-        string (str): The original string.
-        orth (int): Optional hash of the string to look up. If not set, the
-            string will be used and hashed.
-        RETURNS (str): The lemma if the string was found, otherwise the
-            original string.
-        """
-        lookup_table = self.lookups.get_table("lemma_lookup", {})
-        key = orth if orth is not None else string
-        if key in lookup_table:
-            return lookup_table[key]
-        return string
-
-    def lemmatize(
-        self,
-        string: str,
-        index: Dict[str, List[str]],
-        exceptions: Dict[str, Dict[str, List[str]]],
-        rules: Dict[str, List[List[str]]],
-    ) -> List[str]:
-        orig = string
-        string = string.lower()
-        forms = []
-        oov_forms = []
-        for old, new in rules:
-            if string.endswith(old):
-                form = string[: len(string) - len(old)] + new
-                if not form:
-                    pass
-                elif form in index or not form.isalpha():
-                    forms.append(form)
-                else:
-                    oov_forms.append(form)
-        # Remove duplicates but preserve the ordering of applied "rules"
-        forms = list(dict.fromkeys(forms))
-        # Put exceptions at the front of the list, so they get priority.
-        # This is a dodgy heuristic -- but it's the best we can do until we get
-        # frequencies on this. We can at least prune out problematic exceptions,
-        # if they shadow more frequent analyses.
-        for form in exceptions.get(string, []):
-            if form not in forms:
-                forms.insert(0, form)
-        if not forms:
-            forms.extend(oov_forms)
-        if not forms:
-            forms.append(orig)
-        return forms
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@ -28,6 +28,8 @@ def load_lookups(
    # TODO: import spacy_lookups_data instead of going via entry points here?
    lookups = Lookups()
    if lang not in registry.lookups:
+        if strict and len(tables) > 0:
+            raise ValueError(Errors.E955.format(table=", ".join(tables), lang=lang))
        return lookups
    data = registry.lookups.get(lang)
    for table in tables:
@ -41,152 +43,6 @@ def load_lookups(
    return lookups


-class Lookups:
-    """Container for large lookup tables and dictionaries, e.g. lemmatization
-    data or tokenizer exception lists. Lookups are available via vocab.lookups,
-    so they can be accessed before the pipeline components are applied (e.g.
-    in the tokenizer and lemmatizer), as well as within the pipeline components
-    via doc.vocab.lookups.
-    """
-
-    def __init__(self) -> None:
-        """Initialize the Lookups object.
-
-        DOCS: https://spacy.io/api/lookups#init
-        """
-        self._tables = {}
-
-    def __contains__(self, name: str) -> bool:
-        """Check if the lookups contain a table of a given name. Delegates to
-        Lookups.has_table.
-
-        name (str): Name of the table.
-        RETURNS (bool): Whether a table of that name is in the lookups.
-        """
-        return self.has_table(name)
-
-    def __len__(self) -> int:
-        """RETURNS (int): The number of tables in the lookups."""
-        return len(self._tables)
-
-    @property
-    def tables(self) -> List[str]:
-        """RETURNS (List[str]): Names of all tables in the lookups."""
-        return list(self._tables.keys())
-
-    def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> "Table":
-        """Add a new table to the lookups. Raises an error if the table exists.
-
-        name (str): Unique name of table.
-        data (dict): Optional data to add to the table.
-        RETURNS (Table): The newly added table.
-
-        DOCS: https://spacy.io/api/lookups#add_table
-        """
-        if name in self.tables:
-            raise ValueError(Errors.E158.format(name=name))
-        table = Table(name=name, data=data)
-        self._tables[name] = table
-        return table
-
-    def get_table(self, name: str, default: Any = UNSET) -> "Table":
-        """Get a table. Raises an error if the table doesn't exist and no
-        default value is provided.
-
-        name (str): Name of the table.
-        default (Any): Optional default value to return if table doesn't exist.
-        RETURNS (Table): The table.
-
-        DOCS: https://spacy.io/api/lookups#get_table
-        """
-        if name not in self._tables:
-            if default == UNSET:
-                raise KeyError(Errors.E159.format(name=name, tables=self.tables))
-            return default
-        return self._tables[name]
-
-    def remove_table(self, name: str) -> "Table":
-        """Remove a table. Raises an error if the table doesn't exist.
-
-        name (str): Name of the table to remove.
-        RETURNS (Table): The removed table.
-
-        DOCS: https://spacy.io/api/lookups#remove_table
-        """
-        if name not in self._tables:
-            raise KeyError(Errors.E159.format(name=name, tables=self.tables))
-        return self._tables.pop(name)
-
-    def has_table(self, name: str) -> bool:
-        """Check if the lookups contain a table of a given name.
-
-        name (str): Name of the table.
-        RETURNS (bool): Whether a table of that name exists.
-
-        DOCS: https://spacy.io/api/lookups#has_table
-        """
-        return name in self._tables
-
-    def to_bytes(self, **kwargs) -> bytes:
-        """Serialize the lookups to a bytestring.
-
-        RETURNS (bytes): The serialized Lookups.
-
-        DOCS: https://spacy.io/api/lookups#to_bytes
-        """
-        return srsly.msgpack_dumps(self._tables)
-
-    def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
-        """Load the lookups from a bytestring.
-
-        bytes_data (bytes): The data to load.
-        RETURNS (Lookups): The loaded Lookups.
-
-        DOCS: https://spacy.io/api/lookups#from_bytes
-        """
-        self._tables = {}
-        for key, value in srsly.msgpack_loads(bytes_data).items():
-            self._tables[key] = Table(key, value)
-        return self
-
-    def to_disk(
-        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
-    ) -> None:
-        """Save the lookups to a directory as lookups.bin. Expects a path to a
-        directory, which will be created if it doesn't exist.
-
-        path (str / Path): The file path.
-
-        DOCS: https://spacy.io/api/lookups#to_disk
-        """
-        if len(self._tables):
-            path = ensure_path(path)
-            if not path.exists():
-                path.mkdir()
-            filepath = path / filename
-            with filepath.open("wb") as file_:
-                file_.write(self.to_bytes())
-
-    def from_disk(
-        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
-    ) -> "Lookups":
-        """Load lookups from a directory containing a lookups.bin. Will skip
-        loading if the file doesn't exist.
-
-        path (str / Path): The directory path.
-        RETURNS (Lookups): The loaded lookups.
-
-        DOCS: https://spacy.io/api/lookups#from_disk
-        """
-        path = ensure_path(path)
-        filepath = path / filename
-        if filepath.exists():
-            with filepath.open("rb") as file_:
-                data = file_.read()
-            return self.from_bytes(data)
-        return self
-
-
 class Table(OrderedDict):
    """A table in the lookups. Subclass of builtin dict that implements a
    slightly more consistent and unified API.
@ -303,3 +159,159 @@ class Table(OrderedDict):
        self.clear()
        self.update(data)
        return self
+
+
+class Lookups:
+    """Container for large lookup tables and dictionaries, e.g. lemmatization
+    data or tokenizer exception lists. Lookups are available via vocab.lookups,
+    so they can be accessed before the pipeline components are applied (e.g.
+    in the tokenizer and lemmatizer), as well as within the pipeline components
+    via doc.vocab.lookups.
+
+    Tables are kept in a plain dict keyed by table name (`self._tables`).
+    """
+
+    def __init__(self) -> None:
+        """Initialize the Lookups object.
+
+        DOCS: https://spacy.io/api/lookups#init
+        """
+        # Maps table name -> Table.
+        self._tables = {}
+
+    def __contains__(self, name: str) -> bool:
+        """Check if the lookups contain a table of a given name. Delegates to
+        Lookups.has_table.
+
+        name (str): Name of the table.
+        RETURNS (bool): Whether a table of that name is in the lookups.
+        """
+        return self.has_table(name)
+
+    def __len__(self) -> int:
+        """RETURNS (int): The number of tables in the lookups."""
+        return len(self._tables)
+
+    @property
+    def tables(self) -> List[str]:
+        """RETURNS (List[str]): Names of all tables in the lookups."""
+        return list(self._tables.keys())
+
+    def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> Table:
+        """Add a new table to the lookups. Raises an error if the table exists.
+
+        name (str): Unique name of table.
+        data (dict): Optional data to add to the table.
+        RETURNS (Table): The newly added table.
+        RAISES (ValueError): If a table of that name already exists.
+
+        DOCS: https://spacy.io/api/lookups#add_table
+        """
+        # Test membership on the dict itself; `self.tables` would build a
+        # fresh list of all names on every call.
+        if name in self._tables:
+            raise ValueError(Errors.E158.format(name=name))
+        table = Table(name=name, data=data)
+        self._tables[name] = table
+        return table
+
+    def set_table(self, name: str, table: Table) -> None:
+        """Set a table. Unlike add_table, this replaces any existing table of
+        the same name without raising.
+
+        name (str): Name of the table to set.
+        table (Table): The Table to set.
+
+        DOCS: https://spacy.io/api/lookups#set_table
+        """
+        self._tables[name] = table
+
+    def get_table(self, name: str, default: Any = UNSET) -> Table:
+        """Get a table. Raises an error if the table doesn't exist and no
+        default value is provided.
+
+        name (str): Name of the table.
+        default (Any): Optional default value to return if table doesn't exist.
+        RETURNS (Table): The table, or the default if the table is missing.
+        RAISES (KeyError): If the table is missing and no default was given.
+
+        DOCS: https://spacy.io/api/lookups#get_table
+        """
+        if name not in self._tables:
+            # UNSET marks "no default given". Compare by identity so that a
+            # caller-supplied default with a custom __eq__ can never be
+            # mistaken for the marker.
+            if default is UNSET:
+                raise KeyError(Errors.E159.format(name=name, tables=self.tables))
+            return default
+        return self._tables[name]
+
+    def remove_table(self, name: str) -> Table:
+        """Remove a table. Raises an error if the table doesn't exist.
+
+        name (str): Name of the table to remove.
+        RETURNS (Table): The removed table.
+        RAISES (KeyError): If no table of that name exists.
+
+        DOCS: https://spacy.io/api/lookups#remove_table
+        """
+        if name not in self._tables:
+            raise KeyError(Errors.E159.format(name=name, tables=self.tables))
+        return self._tables.pop(name)
+
+    def has_table(self, name: str) -> bool:
+        """Check if the lookups contain a table of a given name.
+
+        name (str): Name of the table.
+        RETURNS (bool): Whether a table of that name exists.
+
+        DOCS: https://spacy.io/api/lookups#has_table
+        """
+        return name in self._tables
+
+    def to_bytes(self, **kwargs) -> bytes:
+        """Serialize the lookups to a bytestring.
+
+        RETURNS (bytes): The serialized Lookups.
+
+        DOCS: https://spacy.io/api/lookups#to_bytes
+        """
+        return srsly.msgpack_dumps(self._tables)
+
+    def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
+        """Load the lookups from a bytestring. Any tables currently held are
+        discarded first.
+
+        bytes_data (bytes): The data to load.
+        RETURNS (Lookups): The loaded Lookups.
+
+        DOCS: https://spacy.io/api/lookups#from_bytes
+        """
+        self._tables = {}
+        for key, value in srsly.msgpack_loads(bytes_data).items():
+            # Re-wrap the deserialized data in a Table named after its key.
+            self._tables[key] = Table(key, value)
+        return self
+
+    def to_disk(
+        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
+    ) -> None:
+        """Save the lookups to a directory as lookups.bin. Expects a path to a
+        directory, which will be created if it doesn't exist. Nothing is
+        written (and no directory is created) if there are no tables.
+
+        path (str / Path): The directory path.
+        filename (str): Name of the file inside the directory.
+
+        DOCS: https://spacy.io/api/lookups#to_disk
+        """
+        if self._tables:
+            path = ensure_path(path)
+            if not path.exists():
+                path.mkdir()
+            filepath = path / filename
+            with filepath.open("wb") as file_:
+                file_.write(self.to_bytes())
+
+    def from_disk(
+        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
+    ) -> "Lookups":
+        """Load lookups from a directory containing a lookups.bin. Will skip
+        loading if the file doesn't exist.
+
+        path (str / Path): The directory path.
+        filename (str): Name of the file inside the directory.
+        RETURNS (Lookups): The loaded lookups.
+
+        DOCS: https://spacy.io/api/lookups#from_disk
+        """
+        path = ensure_path(path)
+        filepath = path / filename
+        if filepath.exists():
+            with filepath.open("rb") as file_:
+                data = file_.read()
+            return self.from_bytes(data)
+        # Missing file: leave the current tables untouched.
+        return self
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@ -27,12 +27,6 @@ cdef class Morphology:
    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
    cdef int insert(self, MorphAnalysisC tag) except -1

-    cdef int assign_untagged(self, TokenC* token) except -1
-    cdef int assign_tag(self, TokenC* token, tag) except -1
-    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
-
-    cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1
-

 cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
 cdef list list_features(const MorphAnalysisC* morph)
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -31,43 +31,15 @@ cdef class Morphology:
    VALUE_SEP = ","
    EMPTY_MORPH = "_" # not an empty string so that the PreshMap key is not 0

-    def __init__(self, StringStore strings, tag_map, lemmatizer, exc=None):
+    def __init__(self, StringStore strings):
        self.mem = Pool()
        self.strings = strings
        self.tags = PreshMap()
-        self.load_tag_map(tag_map)
-        self.lemmatizer = lemmatizer
-
-        self._cache = PreshMapArray(self.n_tags)
-        self._exc = {}
-        if exc is not None:
-            self.load_morph_exceptions(exc)
-
-    def load_tag_map(self, tag_map):
-        self.tag_map = {}
-        self.reverse_index = {}
-        # Add special space symbol. We prefix with underscore, to make sure it
-        # always sorts to the end.
-        if '_SP' in tag_map:
-            space_attrs = tag_map.get('_SP')
-        else:
-            space_attrs = tag_map.get('SP', {POS: SPACE})
-        if '_SP' not in tag_map:
-            self.strings.add('_SP')
-            tag_map = dict(tag_map)
-            tag_map['_SP'] = space_attrs
-        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
-            attrs = self.normalize_attrs(attrs)
-            self.add(attrs)
-            self.tag_map[tag_str] = dict(attrs)
-            self.reverse_index[self.strings.add(tag_str)] = i
-        self.tag_names = tuple(sorted(self.tag_map.keys()))
-        self.n_tags = len(self.tag_map)
-        self._cache = PreshMapArray(self.n_tags)

    def __reduce__(self):
-        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
-                self.exc), None, None)
+        # Pickle support: look up every entry of the string store and keep
+        # the distinct results; unpickle_morphology re-adds them in sorted
+        # order.
+        tags = {self.get(self.strings[s]) for s in self.strings}
+        # The empty string is excluded from the pickled tags (presumably it
+        # is what `get` returns for strings that are not tags -- confirm).
+        tags.discard("")
+        return (unpickle_morphology, (self.strings, sorted(tags)), None, None)

    def add(self, features):
        """Insert a morphological analysis in the morphology table, if not
@ -185,115 +157,6 @@ cdef class Morphology:
        else:
            return self.strings[tag.key]

-    def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
-        if orth not in self.strings:
-            return orth
-        cdef unicode py_string = self.strings[orth]
-        if self.lemmatizer is None:
-            return self.strings.add(py_string.lower())
-        cdef list lemma_strings
-        cdef unicode lemma_string
-        # Normalize features into a dict keyed by the field, to make life easier
-        # for the lemmatizer. Handles string-to-int conversion too.
-        string_feats = {}
-        for key, value in morphology.items():
-            if value is True:
-                name, value = self.strings.as_string(key).split('_', 1)
-                string_feats[name] = value
-            else:
-                string_feats[self.strings.as_string(key)] = self.strings.as_string(value)
-        lemma_strings = self.lemmatizer(py_string, univ_pos, string_feats)
-        lemma_string = lemma_strings[0]
-        lemma = self.strings.add(lemma_string)
-        return lemma
-
-    def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
-                         force=False):
-        """Add a special-case rule to the morphological analyser. Tokens whose
-        tag and orth match the rule will receive the specified properties.
-
-        tag (str): The part-of-speech tag to key the exception.
-        orth (str): The word-form to key the exception.
-        """
-        attrs = dict(attrs)
-        attrs = self.normalize_attrs(attrs)
-        self.add(attrs)
-        attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
-        self._exc[(tag_str, self.strings.add(orth_str))] = attrs
-
-    cdef int assign_untagged(self, TokenC* token) except -1:
-        """Set morphological attributes on a token without a POS tag. Uses
-        the lemmatizer's lookup() method, which looks up the string in the
-        table provided by the language data as lemma_lookup (if available).
-        """
-        if token.lemma == 0:
-            orth_str = self.strings[token.lex.orth]
-            lemma = self.lemmatizer.lookup(orth_str, orth=token.lex.orth)
-            token.lemma = self.strings.add(lemma)
-
-    cdef int assign_tag(self, TokenC* token, tag_str) except -1:
-        cdef attr_t tag = self.strings.as_int(tag_str)
-        if tag in self.reverse_index:
-            tag_id = self.reverse_index[tag]
-            self.assign_tag_id(token, tag_id)
-        else:
-            token.tag = tag
-
-    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
-        if tag_id > self.n_tags:
-            raise ValueError(Errors.E014.format(tag=tag_id))
-        # Ensure spaces get tagged as space.
-        # It seems pretty arbitrary to put this logic here, but there's really
-        # nowhere better. I guess the justification is that this is where the
-        # specific word and the tag interact. Still, we should have a better
-        # way to enforce this rule, or figure out why the statistical model fails.
-        # Related to Issue #220
-        if Lexeme.c_check_flag(token.lex, IS_SPACE):
-            tag_id = self.reverse_index[self.strings.add('_SP')]
-        tag_str = self.tag_names[tag_id]
-        features = dict(self.tag_map.get(tag_str, {}))
-        if features:
-            pos = self.strings.as_int(features.pop(POS))
-        else:
-            pos = 0
-        cdef attr_t lemma = <attr_t>self._cache.get(tag_id, token.lex.orth)
-        if lemma == 0:
-            # Ugh, self.lemmatize has opposite arg order from self.lemmatizer :(
-            lemma = self.lemmatize(pos, token.lex.orth, features)
-            self._cache.set(tag_id, token.lex.orth, <void*>lemma)
-        token.lemma = lemma
-        token.pos = <univ_pos_t>pos
-        token.tag = self.strings[tag_str]
-        token.morph = self.add(features)
-        if (self.tag_names[tag_id], token.lex.orth) in self._exc:
-            self._assign_tag_from_exceptions(token, tag_id)
-
-    cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1:
-        key = (self.tag_names[tag_id], token.lex.orth)
-        cdef dict attrs
-        attrs = self._exc[key]
-        token.pos = attrs.get(POS, token.pos)
-        token.lemma = attrs.get(LEMMA, token.lemma)
-
-    def load_morph_exceptions(self, dict morph_rules):
-        self._exc = {}
-        # Map (form, pos) to attributes
-        for tag, exc in morph_rules.items():
-            for orth, attrs in exc.items():
-                attrs = self.normalize_attrs(attrs)
-                self.add_special_case(self.strings.as_string(tag), self.strings.as_string(orth), attrs)
-
-    @property
-    def exc(self):
-        # generate the serializable exc in the MORPH_RULES format from the
-        # internal tuple-key format
-        morph_rules = {}
-        for (tag, orth) in sorted(self._exc):
-            if not tag in morph_rules:
-                morph_rules[tag] = {}
-            morph_rules[tag][self.strings[orth]] = self._exc[(tag, orth)]
-        return morph_rules
-
    @staticmethod
    def feats_to_dict(feats):
        if not feats or feats == Morphology.EMPTY_MORPH:
@ -338,3 +201,9 @@ cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t fie
            results[n_results] = morph.features[i]
            n_results += 1
    return n_results
+
+def unpickle_morphology(strings, tags):
+    """Rebuild a Morphology when unpickling (see Morphology.__reduce__).
+
+    strings (StringStore): The string store to build the Morphology on.
+    tags (Iterable): The analyses to re-add, in order.
+    RETURNS (Morphology): The reconstructed Morphology.
+    """
+    cdef Morphology restored = Morphology(strings)
+    for tag in tags:
+        restored.add(tag)
+    return restored
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@ -3,9 +3,10 @@ from .dep_parser import DependencyParser
 from .entity_linker import EntityLinker
 from .ner import EntityRecognizer
 from .entityruler import EntityRuler
+from .lemmatizer import Lemmatizer
 from .morphologizer import Morphologizer
 from .pipe import Pipe
-from spacy.pipeline.senter import SentenceRecognizer
+from .senter import SentenceRecognizer
 from .sentencizer import Sentencizer
 from .simple_ner import SimpleNER
 from .tagger import Tagger
@ -20,6 +21,7 @@ __all__ = [
    "EntityRecognizer",
    "EntityRuler",
    "Morphologizer",
+    "Lemmatizer",
    "Pipe",
    "SentenceRecognizer",
    "Sentencizer",
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@ -0,0 +1,330 @@
+from typing import Optional, List, Dict, Any
+
+from thinc.api import Model
+
+from .pipe import Pipe
+from ..errors import Errors
+from ..language import Language
+from ..lookups import Lookups, load_lookups
+from ..scorer import Scorer
+from ..tokens import Doc, Token
+from ..vocab import Vocab
+from .. import util
+
+
+@Language.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={
+        "model": None,
+        "mode": "lookup",
+        "lookups": None,
+        "overwrite": False,
+    },
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+    overwrite: bool = False,
+):
+    """Factory for the "lemmatizer" component.
+
+    The lookups are loaded and validated via Lemmatizer.load_lookups for the
+    pipeline's language and the requested mode before the component is
+    constructed on the pipeline's vocab.
+    """
+    resolved_lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(
+        nlp.vocab,
+        model,
+        name,
+        mode=mode,
+        lookups=resolved_lookups,
+        overwrite=overwrite,
+    )
+
+
+class Lemmatizer(Pipe):
+    """
+    The Lemmatizer supports simple part-of-speech-sensitive suffix rules and
+    lookup tables.
+
+    DOCS: https://spacy.io/api/lemmatizer
+    """
+
+    @classmethod
+    def get_lookups_config(cls, mode: str) -> Dict:
+        """Returns the lookups configuration settings for a given mode for use
+        in Lemmatizer.load_lookups.
+
+        mode (str): The lemmatizer mode.
+        RETURNS (dict): The lookups configuration settings for this mode. An
+            empty dict is returned for modes other than "lookup" and "rule".
+
+        DOCS: https://spacy.io/api/lemmatizer#get_lookups_config
+        """
+        if mode == "lookup":
+            return {
+                "required_tables": ["lemma_lookup"],
+            }
+        elif mode == "rule":
+            return {
+                "required_tables": ["lemma_rules"],
+                "optional_tables": ["lemma_exc", "lemma_index"],
+            }
+        return {}
+
+    @classmethod
+    def load_lookups(
+        cls, lang: str, mode: str, lookups: Optional[Lookups]
+    ) -> Lookups:
+        """Load and validate lookups tables. If the provided lookups is None,
+        load the default lookups tables according to the language and mode
+        settings. Confirm that all required tables for the language and mode
+        are present.
+
+        lang (str): The language code.
+        mode (str): The lemmatizer mode.
+        lookups (Lookups): The provided lookups, may be None if the default
+            lookups should be loaded.
+        RETURNS (Lookups): The Lookups object.
+        RAISES (ValueError): If a table required by the mode is missing.
+
+        DOCS: https://spacy.io/api/lemmatizer#load_lookups
+        """
+        config = cls.get_lookups_config(mode)
+        required_tables = config.get("required_tables", [])
+        optional_tables = config.get("optional_tables", [])
+        if lookups is None:
+            # `load_lookups` here is the module-level helper imported from
+            # ..lookups, not this classmethod.
+            lookups = load_lookups(lang=lang, tables=required_tables)
+            # Optional tables are loaded non-strictly and merged in when
+            # they are available.
+            optional_lookups = load_lookups(
+                lang=lang, tables=optional_tables, strict=False
+            )
+            for table in optional_lookups.tables:
+                lookups.set_table(table, optional_lookups.get_table(table))
+        # Validate both default and user-provided lookups.
+        for table in required_tables:
+            if table not in lookups:
+                raise ValueError(
+                    Errors.E1004.format(
+                        mode=mode, tables=required_tables, found=lookups.tables
+                    )
+                )
+        return lookups
+
+    def __init__(
+        self,
+        vocab: Vocab,
+        model: Optional[Model],
+        name: str = "lemmatizer",
+        *,
+        mode: str = "lookup",
+        lookups: Optional[Lookups] = None,
+        overwrite: bool = False,
+    ) -> None:
+        """Initialize a Lemmatizer.
+
+        vocab (Vocab): The vocab.
+        model (Model): A model (not yet implemented).
+        name (str): The component name. Defaults to "lemmatizer".
+        mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
+            Any other mode is resolved to a method named `{mode}_lemmatize`.
+        lookups (Lookups): The lookups object containing the (optional) tables
+            such as "lemma_rules", "lemma_index", "lemma_exc" and
+            "lemma_lookup". Defaults to None
+        overwrite (bool): Whether to overwrite existing lemmas. Defaults to
+            `False`.
+        RAISES (ValueError): If no lemmatize method exists for the mode.
+
+        DOCS: https://spacy.io/api/lemmatizer#init
+        """
+        self.vocab = vocab
+        self.model = model
+        # Keep the component name instead of silently dropping the argument.
+        self.name = name
+        self._mode = mode
+        self.lookups = lookups if lookups is not None else Lookups()
+        self.overwrite = overwrite
+        if self.mode == "lookup":
+            self.lemmatize = self.lookup_lemmatize
+        elif self.mode == "rule":
+            self.lemmatize = self.rule_lemmatize
+        else:
+            # Custom modes are dispatched by naming convention.
+            try:
+                self.lemmatize = getattr(self, f"{self.mode}_lemmatize")
+            except AttributeError:
+                raise ValueError(Errors.E1003.format(mode=mode)) from None
+        # Used by rule_lemmatize to memoize results.
+        self.cache = {}
+
+    @property
+    def mode(self):
+        """RETURNS (str): The lemmatizer mode (read-only)."""
+        return self._mode
+
+    def __call__(self, doc: Doc) -> Doc:
+        """Apply the lemmatizer to one document.
+
+        doc (Doc): The Doc to process.
+        RETURNS (Doc): The processed Doc.
+
+        DOCS: https://spacy.io/api/lemmatizer#call
+        """
+        for token in doc:
+            # Existing lemmas are kept unless `overwrite` is set. Only the
+            # first candidate returned by the lemmatize method is used.
+            if self.overwrite or token.lemma == 0:
+                token.lemma_ = self.lemmatize(token)[0]
+        return doc
+
+    def pipe(self, stream, *, batch_size=128):
+        """Apply the pipe to a stream of documents. This usually happens under
+        the hood when the nlp object is called on a text and all components are
+        applied to the Doc.
+
+        stream (Iterable[Doc]): A stream of documents.
+        batch_size (int): The number of documents to buffer. Not used by this
+            implementation, which processes one document at a time.
+        YIELDS (Doc): Processed documents in order.
+
+        DOCS: https://spacy.io/api/lemmatizer#pipe
+        """
+        for doc in stream:
+            doc = self(doc)
+            yield doc
+
+    def lookup_lemmatize(self, token: Token) -> List[str]:
+        """Lemmatize using a lookup-based approach.
+
+        token (Token): The token to lemmatize.
+        RETURNS (list): The available lemmas for the string.
+
+        DOCS: https://spacy.io/api/lemmatizer#lookup_lemmatize
+        """
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        # Fall back to the token text itself when there is no entry.
+        result = lookup_table.get(token.text, token.text)
+        if isinstance(result, str):
+            result = [result]
+        return result
+
+    def rule_lemmatize(self, token: Token) -> List[str]:
+        """Lemmatize using a rule-based approach.
+
+        token (Token): The token to lemmatize.
+        RETURNS (list): The available lemmas for the string.
+
+        DOCS: https://spacy.io/api/lemmatizer#rule_lemmatize
+        """
+        cache_key = (token.orth, token.pos, token.morph)
+        if cache_key in self.cache:
+            return self.cache[cache_key]
+        string = token.text
+        univ_pos = token.pos_.lower()
+        # The early exits below are returned directly and are not cached.
+        if univ_pos in ("", "eol", "space"):
+            return [string.lower()]
+        # See Issue #435 for example of where this logic is required.
+        if self.is_base_form(token):
+            return [string.lower()]
+        index_table = self.lookups.get_table("lemma_index", {})
+        exc_table = self.lookups.get_table("lemma_exc", {})
+        rules_table = self.lookups.get_table("lemma_rules", {})
+        # No data at all for this part of speech: keep proper nouns as they
+        # are and lowercase everything else.
+        if not any(
+            (
+                index_table.get(univ_pos),
+                exc_table.get(univ_pos),
+                rules_table.get(univ_pos),
+            )
+        ):
+            if univ_pos == "propn":
+                return [string]
+            else:
+                return [string.lower()]
+
+        index = index_table.get(univ_pos, {})
+        exceptions = exc_table.get(univ_pos, {})
+        rules = rules_table.get(univ_pos, {})
+        orig = string
+        string = string.lower()
+        forms = []
+        oov_forms = []
+        # Each rule is an (old suffix, new suffix) pair.
+        for old, new in rules:
+            if string.endswith(old):
+                form = string[: len(string) - len(old)] + new
+                if not form:
+                    pass
+                elif form in index or not form.isalpha():
+                    forms.append(form)
+                else:
+                    oov_forms.append(form)
+        # Remove duplicates but preserve the ordering of applied "rules"
+        forms = list(dict.fromkeys(forms))
+        # Put exceptions at the front of the list, so they get priority.
+        # This is a dodgy heuristic -- but it's the best we can do until we get
+        # frequencies on this. We can at least prune out problematic exceptions,
+        # if they shadow more frequent analyses.
+        for form in exceptions.get(string, []):
+            if form not in forms:
+                forms.insert(0, form)
+        # Fall back to out-of-index candidates, then to the original string.
+        if not forms:
+            forms.extend(oov_forms)
+        if not forms:
+            forms.append(orig)
+        self.cache[cache_key] = forms
+        return forms
+
+    def is_base_form(self, token: Token) -> bool:
+        """Check whether the token is a base form that does not need further
+        analysis for lemmatization. This implementation always returns False.
+
+        token (Token): The token.
+        RETURNS (bool): Whether the token is a base form.
+
+        DOCS: https://spacy.io/api/lemmatizer#is_base_form
+        """
+        return False
+
+    def score(self, examples, **kwargs) -> Dict[str, Any]:
+        """Score a batch of examples.
+
+        examples (Iterable[Example]): The examples to score.
+        RETURNS (Dict[str, Any]): The scores.
+
+        DOCS: https://spacy.io/api/lemmatizer#score
+        """
+        return Scorer.score_token_attr(examples, "lemma", **kwargs)
+
+    def to_disk(self, path, *, exclude=tuple()):
+        """Save the current state to a directory. The vocab and the lookups
+        are the serialized fields.
+
+        path (unicode or Path): A path to a directory, which will be created if
+            it doesn't exist.
+        exclude (list): String names of serialization fields to exclude.
+
+        DOCS: https://spacy.io/api/lemmatizer#to_disk
+        """
+        serialize = {}
+        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
+        serialize["lookups"] = lambda p: self.lookups.to_disk(p)
+        util.to_disk(path, serialize, exclude)
+
+    def from_disk(self, path, *, exclude=tuple()) -> "Lemmatizer":
+        """Loads state from a directory. Modifies the object in place and
+        returns it.
+
+        path (unicode or Path): A path to a directory.
+        exclude (list): String names of serialization fields to exclude.
+        RETURNS (Lemmatizer): The modified `Lemmatizer` object.
+
+        DOCS: https://spacy.io/api/lemmatizer#from_disk
+        """
+        deserialize = {}
+        deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
+        deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
+        util.from_disk(path, deserialize, exclude)
+        # Return the component itself, as documented.
+        return self
+
+    def to_bytes(self, *, exclude=tuple()) -> bytes:
+        """Serialize the current state to a binary string.
+
+        exclude (list): String names of serialization fields to exclude.
+        RETURNS (bytes): The serialized form of the `Lemmatizer` object.
+
+        DOCS: https://spacy.io/api/lemmatizer#to_bytes
+        """
+        serialize = {}
+        serialize["vocab"] = self.vocab.to_bytes
+        serialize["lookups"] = self.lookups.to_bytes
+        return util.to_bytes(serialize, exclude)
+
+    def from_bytes(self, bytes_data: bytes, *, exclude=tuple()) -> "Lemmatizer":
+        """Load state from a binary string. Modifies the object in place and
+        returns it.
+
+        bytes_data (bytes): The data to load from.
+        exclude (list): String names of serialization fields to exclude.
+        RETURNS (Lemmatizer): The modified `Lemmatizer` object.
+
+        DOCS: https://spacy.io/api/lemmatizer#from_bytes
+        """
+        deserialize = {}
+        deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
+        deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
+        util.from_bytes(bytes_data, deserialize, exclude)
+        # Return the component itself, as documented.
+        return self
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@ -39,12 +39,12 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
    "tagger",
    assigns=["token.tag"],
-    default_config={"model": DEFAULT_TAGGER_MODEL, "set_morphology": False},
-    scores=["tag_acc", "pos_acc", "lemma_acc"],
+    default_config={"model": DEFAULT_TAGGER_MODEL},
+    scores=["tag_acc"],
    default_score_weights={"tag_acc": 1.0},
 )
-def make_tagger(nlp: Language, name: str, model: Model, set_morphology: bool):
-    return Tagger(nlp.vocab, model, name, set_morphology=set_morphology)
+def make_tagger(nlp: Language, name: str, model: Model):
+    """Factory for the "tagger" component: build a Tagger on the pipeline's
+    vocab with the given model and component name.
+    """
+    return Tagger(nlp.vocab, model, name)


 class Tagger(Pipe):
@ -52,13 +52,14 @@ class Tagger(Pipe):

    DOCS: https://spacy.io/api/tagger
    """
-    def __init__(self, vocab, model, name="tagger", *, set_morphology=False):
+    def __init__(self, vocab, model, name="tagger", *, labels=None):
        """Initialize a part-of-speech tagger.

        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
+        labels (List): The set of labels. Defaults to None.
-        set_morphology (bool): Whether to set morphological features.

        DOCS: https://spacy.io/api/tagger#init
@ -67,7 +68,7 @@ class Tagger(Pipe):
        self.model = model
        self.name = name
        self._rehearsal_model = None
-        cfg = {"set_morphology": set_morphology}
+        cfg = {"labels": labels or []}
        self.cfg = dict(sorted(cfg.items()))

    @property
@ -80,7 +81,7 @@ class Tagger(Pipe):

        DOCS: https://spacy.io/api/tagger#labels
        """
-        return tuple(self.vocab.morphology.tag_names)
+        return tuple(self.cfg["labels"])

    def __call__(self, doc):
        """Apply the pipe to a Doc.
@ -150,9 +151,7 @@ class Tagger(Pipe):
        if isinstance(docs, Doc):
            docs = [docs]
        cdef Doc doc
-        cdef int idx = 0
        cdef Vocab vocab = self.vocab
-        assign_morphology = self.cfg.get("set_morphology", True)
        for i, doc in enumerate(docs):
            doc_tag_ids = batch_tag_ids[i]
            if hasattr(doc_tag_ids, "get"):
@ -160,15 +159,7 @@ class Tagger(Pipe):
            for j, tag_id in enumerate(doc_tag_ids):
                # Don't clobber preset POS tags
                if doc.c[j].tag == 0:
-                    if doc.c[j].pos == 0 and assign_morphology:
-                        # Don't clobber preset lemmas
-                        lemma = doc.c[j].lemma
-                        vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
-                        if lemma != 0 and lemma != doc.c[j].lex.orth:
-                            doc.c[j].lemma = lemma
-                    else:
-                        doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
-                idx += 1
+                    doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
            doc.is_tagged = True

    def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
@ -279,55 +270,26 @@ class Tagger(Pipe):

        DOCS: https://spacy.io/api/tagger#begin_training
        """
-        lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
-        if not any(table in self.vocab.lookups for table in lemma_tables):
-            warnings.warn(Warnings.W022)
-        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
-        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
-            langs = ", ".join(util.LEXEME_NORM_LANGS)
-            warnings.warn(Warnings.W033.format(model="part-of-speech tagger", langs=langs))
-        orig_tag_map = dict(self.vocab.morphology.tag_map)
-        new_tag_map = {}
+        tags = set()
        for example in get_examples():
            try:
                y = example.y
            except AttributeError:
                raise TypeError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) from None
            for token in y:
-                tag = token.tag_
-                if tag in orig_tag_map:
-                    new_tag_map[tag] = orig_tag_map[tag]
-                else:
-                    new_tag_map[tag] = {POS: X}
-
-        cdef Vocab vocab = self.vocab
-        if new_tag_map:
-            if "_SP" in orig_tag_map:
-                new_tag_map["_SP"] = orig_tag_map["_SP"]
-            vocab.morphology.load_tag_map(new_tag_map)
+                tags.add(token.tag_)
+        for tag in sorted(tags):
+            self.add_label(tag)
        self.set_output(len(self.labels))
-        doc_sample = [Doc(self.vocab, words=["hello", "world"])]
-        if pipeline is not None:
-            for name, component in pipeline:
-                if component is self:
-                    break
-                if hasattr(component, "pipe"):
-                    doc_sample = list(component.pipe(doc_sample))
-                else:
-                    doc_sample = [component(doc) for doc in doc_sample]
-        self.model.initialize(X=doc_sample)
-        # Get batch of example docs, example outputs to call begin_training().
-        # This lets the model infer shapes.
+        self.model.initialize()
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd

-    def add_label(self, label, values=None):
+    def add_label(self, label):
        """Add a new label to the pipe.

        label (str): The label to add.
-        values (Dict[int, str]): Optional values to map to the label, e.g. a
-            tag map dictionary.
        RETURNS (int): 0 if label is already present, otherwise 1.

        DOCS: https://spacy.io/api/tagger#add_label
@ -336,22 +298,8 @@ class Tagger(Pipe):
            raise ValueError(Errors.E187)
        if label in self.labels:
            return 0
-        if self.model.has_dim("nO"):
-            # Here's how the model resizing will work, once the
-            # neuron-to-tag mapping is no longer controlled by
-            # the Morphology class, which sorts the tag names.
-            # The sorting makes adding labels difficult.
-            # smaller = self.model._layers[-1]
-            # larger = Softmax(len(self.labels)+1, smaller.nI)
-            # copy_array(larger.W[:smaller.nO], smaller.W)
-            # copy_array(larger.b[:smaller.nO], smaller.b)
-            # self.model._layers[-1] = larger
-            raise ValueError(TempErrors.T003)
-        tag_map = dict(self.vocab.morphology.tag_map)
-        if values is None:
-            values = {POS: "X"}
-        tag_map[label] = values
-        self.vocab.morphology.load_tag_map(tag_map)
+        self.cfg["labels"].append(label)
+        self.vocab.strings.add(label)
        return 1

    def score(self, examples, **kwargs):
@ -363,11 +311,7 @@ class Tagger(Pipe):

        DOCS: https://spacy.io/api/tagger#score
        """
-        scores = {}
-        scores.update(Scorer.score_token_attr(examples, "tag", **kwargs))
-        scores.update(Scorer.score_token_attr(examples, "pos", **kwargs))
-        scores.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
-        return scores
+        return Scorer.score_token_attr(examples, "tag", **kwargs)

    def to_bytes(self, *, exclude=tuple()):
        """Serialize the pipe to a bytestring.
@ -381,10 +325,6 @@ class Tagger(Pipe):
        serialize["model"] = self.model.to_bytes
        serialize["vocab"] = self.vocab.to_bytes
        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
-        tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
-        serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map)
-        morph_rules = dict(self.vocab.morphology.exc)
-        serialize["morph_rules"] = lambda: srsly.msgpack_dumps(morph_rules)
        return util.to_bytes(serialize, exclude)

    def from_bytes(self, bytes_data, *, exclude=tuple()):
@ -402,21 +342,8 @@ class Tagger(Pipe):
            except AttributeError:
                raise ValueError(Errors.E149) from None

-        def load_tag_map(b):
-            tag_map = srsly.msgpack_loads(b)
-            self.vocab.morphology.load_tag_map(tag_map)
-
-        def load_morph_rules(b):
-            morph_rules = srsly.msgpack_loads(b)
-            self.vocab.morphology.load_morph_exceptions(morph_rules)
-
-        self.vocab.morphology = Morphology(self.vocab.strings, dict(),
-            lemmatizer=self.vocab.morphology.lemmatizer)
-
        deserialize = {
            "vocab": lambda b: self.vocab.from_bytes(b),
-            "tag_map": load_tag_map,
-            "morph_rules": load_morph_rules,
            "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
            "model": lambda b: load_model(b),
        }
@ -431,12 +358,8 @@ class Tagger(Pipe):

        DOCS: https://spacy.io/api/tagger#to_disk
        """
-        tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
-        morph_rules = dict(self.vocab.morphology.exc)
        serialize = {
            "vocab": lambda p: self.vocab.to_disk(p),
-            "tag_map": lambda p: srsly.write_msgpack(p, tag_map),
-            "morph_rules": lambda p: srsly.write_msgpack(p, morph_rules),
            "model": lambda p: self.model.to_disk(p),
            "cfg": lambda p: srsly.write_json(p, self.cfg),
        }
@ -458,22 +381,9 @@ class Tagger(Pipe):
                except AttributeError:
                    raise ValueError(Errors.E149) from None

-        def load_tag_map(p):
-            tag_map = srsly.read_msgpack(p)
-            self.vocab.morphology.load_tag_map(tag_map)
-
-        def load_morph_rules(p):
-            morph_rules = srsly.read_msgpack(p)
-            self.vocab.morphology.load_morph_exceptions(morph_rules)
-
-        self.vocab.morphology = Morphology(self.vocab.strings, dict(),
-            lemmatizer=self.vocab.morphology.lemmatizer)
-
        deserialize = {
            "vocab": lambda p: self.vocab.from_disk(p),
            "cfg": lambda p: self.cfg.update(deserialize_config(p)),
-            "tag_map": load_tag_map,
-            "morph_rules": load_morph_rules,
            "model": load_model,
        }
        util.from_disk(path, deserialize, exclude)
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@ -220,7 +220,6 @@ class ConfigSchemaNlp(BaseModel):
    lang: StrictStr = Field(..., title="The base language to use")
    pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
    tokenizer: Callable = Field(..., title="The tokenizer to use")
-    lemmatizer: Callable = Field(..., title="The lemmatizer to use")
    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
    before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
    after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@ -201,7 +201,7 @@ def ru_tokenizer():
@pytest.fixture
 def ru_lemmatizer():
    pytest.importorskip("pymorphy2")
-    return get_lang_class("ru")().vocab.morphology.lemmatizer
+    return get_lang_class("ru")().add_pipe("lemmatizer")


@pytest.fixture(scope="session")
--- a/spacy/tests/doc/test_creation.py
+++ b/spacy/tests/doc/test_creation.py
@ -1,21 +1,12 @@
 import pytest
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
-from spacy.lemmatizer import Lemmatizer
-from spacy.lookups import Lookups
 from spacy import util


@pytest.fixture
-def lemmatizer():
-    lookups = Lookups()
-    lookups.add_table("lemma_lookup", {"dogs": "dog", "boxen": "box", "mice": "mouse"})
-    return Lemmatizer(lookups)
-
-
-@pytest.fixture
-def vocab(lemmatizer):
-    return Vocab(lemmatizer=lemmatizer)
+def vocab():
+    return Vocab()


 def test_empty_doc(vocab):
@ -30,14 +21,6 @@ def test_single_word(vocab):
    assert doc.text == "a"


-def test_lookup_lemmatization(vocab):
-    doc = Doc(vocab, words=["dogs", "dogses"])
-    assert doc[0].text == "dogs"
-    assert doc[0].lemma_ == "dog"
-    assert doc[1].text == "dogses"
-    assert doc[1].lemma_ == "dogses"
-
-
 def test_create_from_words_and_text(vocab):
    # no whitespace in words
    words = ["'", "dogs", "'", "run"]
--- a/spacy/tests/doc/test_morphanalysis.py
+++ b/spacy/tests/doc/test_morphanalysis.py
@ -1,23 +1,17 @@
 import pytest
-from spacy.symbols import POS, PRON, VERB


@pytest.fixture
 def i_has(en_tokenizer):
    doc = en_tokenizer("I has")
-    tag_map = {
-        "PRP": {POS: PRON, "PronType": "prs"},
-        "VBZ": {
-            POS: VERB,
-            "VerbForm": "fin",
-            "Tense": "pres",
-            "Number": "sing",
-            "Person": "three",
-        },
+    doc[0].morph_ = {"PronType": "prs"}
+    doc[1].morph_ = {
+        "VerbForm": "fin",
+        "Tense": "pres",
+        "Number": "sing",
+        "Person": "three",
    }
-    en_tokenizer.vocab.morphology.load_tag_map(tag_map)
-    doc[0].tag_ = "PRP"
-    doc[1].tag_ = "VBZ"
+
    return doc


--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@ -124,7 +124,6 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
    assert doc[0].text == "The players"
    assert doc[0].tag_ == "NN"
    assert doc[0].pos_ == "NOUN"
-    assert doc[0].lemma_ == "The players"
    doc = get_doc(
        tokens.vocab,
        words=[t.text for t in tokens],
@ -143,11 +142,9 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
    assert doc[0].text == "The players"
    assert doc[0].tag_ == "NN"
    assert doc[0].pos_ == "NOUN"
-    assert doc[0].lemma_ == "The players"
    assert doc[1].text == "start ."
    assert doc[1].tag_ == "VBZ"
    assert doc[1].pos_ == "VERB"
-    assert doc[1].lemma_ == "start ."


 def test_doc_retokenize_spans_merge_heads(en_tokenizer):
--- a/spacy/tests/lang/en/test_tagger.py
+++ b/spacy/tests/lang/en/test_tagger.py
@ -1,21 +0,0 @@
-from spacy.symbols import POS, PRON, VERB, DET, NOUN, PUNCT
-from ...util import get_doc
-
-
-def test_en_tagger_load_morph_exc(en_tokenizer):
-    text = "I like his style."
-    tags = ["PRP", "VBP", "PRP$", "NN", "."]
-    tag_map = {
-        "PRP": {POS: PRON},
-        "VBP": {POS: VERB},
-        "PRP$": {POS: DET},
-        "NN": {POS: NOUN},
-        ".": {POS: PUNCT},
-    }
-    morph_exc = {"VBP": {"like": {"lemma": "luck"}}}
-    en_tokenizer.vocab.morphology.load_tag_map(tag_map)
-    en_tokenizer.vocab.morphology.load_morph_exceptions(morph_exc)
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags)
-    assert doc[1].tag_ == "VBP"
-    assert doc[1].lemma_ == "luck"
--- a/spacy/tests/lang/ru/test_lemmatizer.py
+++ b/spacy/tests/lang/ru/test_lemmatizer.py
@ -3,15 +3,16 @@ import pytest
 from ...util import get_doc


-@pytest.mark.xfail(reason="TODO: investigate why lemmatizer fails here")
-def test_ru_doc_lemmatization(ru_tokenizer):
+def test_ru_doc_lemmatization(ru_lemmatizer):
    words = ["мама", "мыла", "раму"]
-    tags = [
-        "NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
-        "VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
-        "NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
+    pos = ["NOUN", "VERB", "NOUN"]
+    morphs = [
+        "Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
+        "Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
+        "Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
    ]
-    doc = get_doc(ru_tokenizer.vocab, words=words, tags=tags)
+    doc = get_doc(ru_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
+    doc = ru_lemmatizer(doc)
    lemmas = [token.lemma_ for token in doc]
    assert lemmas == ["мама", "мыть", "рама"]

@ -27,43 +28,51 @@ def test_ru_doc_lemmatization(ru_tokenizer):
    ],
 )
 def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
-    assert sorted(ru_lemmatizer.noun(text)) == lemmas
+    doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"])
+    result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
+    assert sorted(result_lemmas) == lemmas


@pytest.mark.parametrize(
-    "text,pos,morphology,lemma",
+    "text,pos,morph,lemma",
    [
-        ("рой", "NOUN", None, "рой"),
-        ("рой", "VERB", None, "рыть"),
-        ("клей", "NOUN", None, "клей"),
-        ("клей", "VERB", None, "клеить"),
-        ("три", "NUM", None, "три"),
-        ("кос", "NOUN", {"Number": "Sing"}, "кос"),
-        ("кос", "NOUN", {"Number": "Plur"}, "коса"),
-        ("кос", "ADJ", None, "косой"),
-        ("потом", "NOUN", None, "пот"),
-        ("потом", "ADV", None, "потом"),
+        ("рой", "NOUN", "", "рой"),
+        ("рой", "VERB", "", "рыть"),
+        ("клей", "NOUN", "", "клей"),
+        ("клей", "VERB", "", "клеить"),
+        ("три", "NUM", "", "три"),
+        ("кос", "NOUN", "Number=Sing", "кос"),
+        ("кос", "NOUN", "Number=Plur", "коса"),
+        ("кос", "ADJ", "", "косой"),
+        ("потом", "NOUN", "", "пот"),
+        ("потом", "ADV", "", "потом"),
    ],
 )
 def test_ru_lemmatizer_works_with_different_pos_homonyms(
-    ru_lemmatizer, text, pos, morphology, lemma
+    ru_lemmatizer, text, pos, morph, lemma
 ):
-    assert ru_lemmatizer(text, pos, morphology) == [lemma]
+    doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=[pos], morphs=[morph])
+    result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
+    assert result_lemmas == [lemma]


@pytest.mark.parametrize(
-    "text,morphology,lemma",
+    "text,morph,lemma",
    [
-        ("гвоздики", {"Gender": "Fem"}, "гвоздика"),
-        ("гвоздики", {"Gender": "Masc"}, "гвоздик"),
-        ("вина", {"Gender": "Fem"}, "вина"),
-        ("вина", {"Gender": "Neut"}, "вино"),
+        ("гвоздики", "Gender=Fem", "гвоздика"),
+        ("гвоздики", "Gender=Masc", "гвоздик"),
+        ("вина", "Gender=Fem", "вина"),
+        ("вина", "Gender=Neut", "вино"),
    ],
 )
-def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morphology, lemma):
-    assert ru_lemmatizer.noun(text, morphology) == [lemma]
+def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morph, lemma):
+    doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"], morphs=[morph])
+    result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
+    assert result_lemmas == [lemma]


 def test_ru_lemmatizer_punct(ru_lemmatizer):
-    assert ru_lemmatizer.punct("«") == ['"']
-    assert ru_lemmatizer.punct("»") == ['"']
+    doc = get_doc(ru_lemmatizer.vocab, words=["«"], pos=["PUNCT"])
+    assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
+    doc = get_doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
+    assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
--- a/spacy/tests/lang/test_lemmatizers.py
+++ b/spacy/tests/lang/test_lemmatizers.py
@ -0,0 +1,34 @@
+import pytest
+from spacy import registry
+from spacy.lookups import Lookups
+from spacy.util import get_lang_class
+
+
+# fmt: off
+# Only include languages with no external dependencies
+# excluded: ru, uk
+# excluded for custom tables: pl
+LANGUAGES = ["el", "en", "fr", "nl"]
+# fmt: on
+
+
+@pytest.mark.parametrize("lang", LANGUAGES)
+def test_lemmatizer_initialize(lang, capfd):
+    """Test that languages can be initialized."""
+    @registry.assets("lemmatizer_init_lookups")
+    def lemmatizer_init_lookups():
+        lookups = Lookups()
+        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
+        lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
+        lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
+        return lookups
+
+    nlp = get_lang_class(lang)()
+    nlp.add_pipe(
+        "lemmatizer", config={"lookups": {"@assets": "lemmatizer_init_lookups"}}
+    )
+    # Check for stray print statements (see #3342)
+    doc = nlp("test")  # noqa: F841
+    captured = capfd.readouterr()
+    assert not captured.out
--- a/spacy/tests/morphology/test_morph_features.py
+++ b/spacy/tests/morphology/test_morph_features.py
@ -1,14 +1,11 @@
 import pytest
 from spacy.morphology import Morphology
 from spacy.strings import StringStore, get_string_id
-from spacy.lemmatizer import Lemmatizer
-from spacy.lookups import Lookups


@pytest.fixture
 def morphology():
-    lemmatizer = Lemmatizer(Lookups())
-    return Morphology(StringStore(), {}, lemmatizer)
+    return Morphology(StringStore())


 def test_init(morphology):
--- a/spacy/tests/morphology/test_morph_pickle.py
+++ b/spacy/tests/morphology/test_morph_pickle.py
@ -2,21 +2,18 @@ import pytest
 import pickle
 from spacy.morphology import Morphology
 from spacy.strings import StringStore
-from spacy.lemmatizer import Lemmatizer
-from spacy.lookups import Lookups


@pytest.fixture
 def morphology():
-    tag_map = {"A": {"POS": "X"}, "B": {"POS": "NOUN"}}
-    exc = {"A": {"a": {"POS": "VERB"}}}
-    lemmatizer = Lemmatizer(Lookups())
-    return Morphology(StringStore(), tag_map, lemmatizer, exc=exc)
+    morphology = Morphology(StringStore())
+    morphology.add("Feat1=Val1|Feat2=Val2")
+    morphology.add("Feat3=Val3|Feat4=Val4")
+    return morphology


 def test_morphology_pickle_roundtrip(morphology):
    b = pickle.dumps(morphology)
    reloaded_morphology = pickle.loads(b)
-
-    assert morphology.tag_map == reloaded_morphology.tag_map
-    assert morphology.exc == reloaded_morphology.exc
+    assert reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"]) == "Feat1=Val1|Feat2=Val2"
+    assert reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"]) == "Feat3=Val3|Feat4=Val4"
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@ -82,10 +82,10 @@ def test_parser_merge_pp(en_tokenizer):
    text = "A phrase with another phrase occurs"
    heads = [1, 4, -1, 1, -2, 0]
    deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT"]
-    tags = ["DT", "NN", "IN", "DT", "NN", "VBZ"]
+    pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB"]
    tokens = en_tokenizer(text)
    doc = get_doc(
-        tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags
+        tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos,
    )
    with doc.retokenize() as retokenizer:
        for np in doc.noun_chunks:
--- a/spacy/tests/pipeline/test_lemmatizer.py
+++ b/spacy/tests/pipeline/test_lemmatizer.py
@ -0,0 +1,109 @@
+import pytest
+
+from spacy import util, registry
+from spacy.lang.en import English
+from spacy.lookups import Lookups, load_lookups
+
+from ..util import make_tempdir
+
+
+@pytest.fixture
+def nlp():
+    return English()
+
+
+@pytest.fixture
+def lemmatizer(nlp):
+    @registry.assets("cope_lookups")
+    def cope_lookups():
+        lookups = Lookups()
+        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
+        lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
+        lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
+        return lookups
+
+    lemmatizer = nlp.add_pipe(
+        "lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}}
+    )
+    return lemmatizer
+
+
+def test_lemmatizer_init(nlp):
+    @registry.assets("cope_lookups")
+    def cope_lookups():
+        lookups = Lookups()
+        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
+        lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
+        lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
+        return lookups
+
+    lemmatizer = nlp.add_pipe(
+        "lemmatizer", config={"mode": "lookup", "lookups": {"@assets": "cope_lookups"}}
+    )
+    assert isinstance(lemmatizer.lookups, Lookups)
+    assert lemmatizer.mode == "lookup"
+    # replace any tables from spacy-lookups-data
+    lemmatizer.lookups = Lookups()
+    doc = nlp("coping")
+    # lookup with no tables sets text as lemma
+    assert doc[0].lemma_ == "coping"
+
+    nlp.remove_pipe("lemmatizer")
+
+    @registry.assets("empty_lookups")
+    def empty_lookups():
+        return Lookups()
+
+    with pytest.raises(ValueError):
+        nlp.add_pipe(
+            "lemmatizer",
+            config={"mode": "lookup", "lookups": {"@assets": "empty_lookups"}},
+        )
+
+
+def test_lemmatizer_config(nlp, lemmatizer):
+    doc = nlp.make_doc("coping")
+    doc[0].pos_ = "VERB"
+    assert doc[0].lemma_ == ""
+    doc = lemmatizer(doc)
+    assert doc[0].text == "coping"
+    assert doc[0].lemma_ == "cope"
+    # NOTE(review): identical second pass on a fresh doc; presumably exercises the rule-mode cache - confirm
+    doc = nlp.make_doc("coping")
+    doc[0].pos_ = "VERB"
+    assert doc[0].lemma_ == ""
+    doc = lemmatizer(doc)
+    assert doc[0].text == "coping"
+    assert doc[0].lemma_ == "cope"
+
+
+def test_lemmatizer_serialize(nlp, lemmatizer):
+    @registry.assets("cope_lookups")
+    def cope_lookups():
+        lookups = Lookups()
+        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
+        lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
+        lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
+        return lookups
+
+    nlp2 = English()
+    lemmatizer2 = nlp2.add_pipe(
+        "lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}}
+    )
+    lemmatizer2.from_bytes(lemmatizer.to_bytes())
+    assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
+    assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables
+
+    # Also test the results are still the same after IO (use the pipe reloaded from disk)
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        doc2 = nlp2.make_doc("coping")
+        doc2[0].pos_ = "VERB"
+        assert doc2[0].lemma_ == ""
+        doc2 = nlp2.get_pipe("lemmatizer")(doc2)
+        assert doc2[0].text == "coping"
+        assert doc2[0].lemma_ == "cope"
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@ -23,13 +23,12 @@ def test_tagger_begin_training_tag_map():
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
    orig_tag_count = len(tagger.labels)
-    tagger.add_label("A", {"POS": "NOUN"})
+    tagger.add_label("A")
    nlp.begin_training()
-    assert nlp.vocab.morphology.tag_map["A"] == {POS: NOUN}
    assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)


-TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}
+TAGS = ("N", "V", "J")

 MORPH_RULES = {"V": {"like": {"lemma": "luck"}}}

@ -42,15 +41,12 @@ TRAIN_DATA = [
 def test_overfitting_IO():
    # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
    nlp = English()
-    nlp.vocab.morphology.load_tag_map(TAG_MAP)
-    nlp.vocab.morphology.load_morph_exceptions(MORPH_RULES)
-    tagger = nlp.add_pipe("tagger", config={"set_morphology": True})
-    nlp.vocab.morphology.load_tag_map(TAG_MAP)
+    tagger = nlp.add_pipe("tagger")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    for tag, values in TAG_MAP.items():
-        tagger.add_label(tag, values)
+    for tag in TAGS:
+        tagger.add_label(tag)
    optimizer = nlp.begin_training()

    for i in range(50):
@ -65,7 +61,6 @@ def test_overfitting_IO():
    assert doc[1].tag_ is "V"
    assert doc[2].tag_ is "J"
    assert doc[3].tag_ is "N"
-    assert doc[1].lemma_ == "luck"

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
@ -76,4 +71,3 @@ def test_overfitting_IO():
        assert doc2[1].tag_ is "V"
        assert doc2[2].tag_ is "J"
        assert doc2[3].tag_ is "N"
-        assert doc[1].lemma_ == "luck"
--- a/spacy/tests/regression/test_issue1-1000.py
+++ b/spacy/tests/regression/test_issue1-1000.py
@ -8,10 +8,8 @@ from spacy.attrs import IS_PUNCT, ORTH, LOWER
 from spacy.symbols import POS, VERB
 from spacy.vocab import Vocab
 from spacy.lang.en import English
-from spacy.lemmatizer import Lemmatizer
 from spacy.lookups import Lookups
 from spacy.tokens import Doc, Span
-from spacy.lang.en.lemmatizer import is_base_form

 from ..util import get_doc, make_tempdir

@ -157,16 +155,15 @@ def test_issue590(en_vocab):
    assert len(matches) == 2


+@pytest.mark.skip(reason="Old vocab-based lemmatization")
 def test_issue595():
    """Test lemmatization of base forms"""
    words = ["Do", "n't", "feed", "the", "dog"]
-    tag_map = {"VB": {POS: VERB, "VerbForm": "inf"}}
    lookups = Lookups()
    lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
    lookups.add_table("lemma_index", {"verb": {}})
    lookups.add_table("lemma_exc", {"verb": {}})
-    lemmatizer = Lemmatizer(lookups, is_base_form=is_base_form)
-    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
+    vocab = Vocab()
    doc = Doc(vocab, words=words)
    doc[2].tag_ = "VB"
    assert doc[2].text == "feed"
@ -389,6 +386,7 @@ def test_issue891(en_tokenizer, text):
    assert tokens[1].text == "/"


+@pytest.mark.skip(reason="Old vocab-based lemmatization")
@pytest.mark.parametrize(
    "text,tag,lemma",
    [("anus", "NN", "anus"), ("princess", "NN", "princess"), ("inner", "JJ", "inner")],
--- a/spacy/tests/regression/test_issue1001-1500.py
+++ b/spacy/tests/regression/test_issue1001-1500.py
@ -6,7 +6,6 @@ from spacy.lang.en import English
 from spacy.lang.lex_attrs import LEX_ATTRS
 from spacy.matcher import Matcher
 from spacy.tokenizer import Tokenizer
-from spacy.lemmatizer import Lemmatizer
 from spacy.lookups import Lookups
 from spacy.symbols import ORTH, LEMMA, POS, VERB

@ -57,6 +56,7 @@ def test_issue1242():
    assert len(docs[1]) == 1


+@pytest.mark.skip(reason="v3 no longer supports LEMMA/POS in tokenizer special cases")
 def test_issue1250():
    """Test cached special cases."""
    special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}]
@ -87,20 +87,6 @@ def test_issue1375():
    assert doc[1].nbor(1).text == "2"


-def test_issue1387():
-    tag_map = {"VBG": {POS: VERB, "VerbForm": "part"}}
-    lookups = Lookups()
-    lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
-    lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
-    lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
-    lemmatizer = Lemmatizer(lookups)
-    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
-    doc = Doc(vocab, words=["coping"])
-    doc[0].tag_ = "VBG"
-    assert doc[0].text == "coping"
-    assert doc[0].lemma_ == "cope"
-
-
 def test_issue1434():
    """Test matches occur when optional element at end of short doc."""
    pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}]
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@ -130,8 +130,6 @@ def test_issue1727():
    vectors = Vectors(data=data, keys=["I", "am", "Matt"])
    tagger = nlp.create_pipe("tagger")
    tagger.add_label("PRP")
-    with pytest.warns(UserWarning):
-        tagger.begin_training()
    assert tagger.cfg.get("pretrained_dims", 0) == 0
    tagger.vocab.vectors = vectors
    with make_tempdir() as path:
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ b/spacy/tests/regression/test_issue2501-3000.py
@ -19,8 +19,8 @@ def test_issue2564():
    """Test the tagger sets is_tagged correctly when used via Language.pipe."""
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
-    with pytest.warns(UserWarning):
-        tagger.begin_training()  # initialise weights
+    tagger.add_label("A")
+    tagger.begin_training()
    doc = nlp("hello world")
    assert doc.is_tagged
    docs = nlp.pipe(["hello", "world"])
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@ -241,11 +241,11 @@ def test_issue3449():
    assert t3[5].text == "I"


-@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue3456():
    # this crashed because of a padding error in layer.ops.unflatten in thinc
    nlp = English()
-    nlp.add_pipe("tagger")
+    tagger = nlp.add_pipe("tagger")
+    tagger.add_label("A")
    nlp.begin_training()
    list(nlp.pipe(["hi", ""]))

--- a/spacy/tests/regression/test_issue3501-4000.py
+++ b/spacy/tests/regression/test_issue3501-4000.py
@ -149,13 +149,15 @@ def test_issue3540(en_vocab):
    gold_text = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.text for token in doc] == gold_text
    gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
+    for i, lemma in enumerate(gold_lemma):
+        doc[i].lemma_ = lemma
    assert [token.lemma_ for token in doc] == gold_lemma
    vectors_1 = [token.vector for token in doc]
    assert len(vectors_1) == len(doc)

    with doc.retokenize() as retokenizer:
        heads = [(doc[3], 1), doc[2]]
-        attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
+        attrs = {"POS": ["PROPN", "PROPN"], "LEMMA": ["New", "York"], "DEP": ["pobj", "compound"]}
        retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)

    gold_text = ["I", "live", "in", "New", "York", "right", "now"]
--- a/spacy/tests/regression/test_issue4001-4500.py
+++ b/spacy/tests/regression/test_issue4001-4500.py
@ -271,6 +271,7 @@ def test_issue4267():
        assert token.ent_iob == 2


+@pytest.mark.skip(reason="lemmatizer lookups no longer in vocab")
 def test_issue4272():
    """Test that lookup table can be accessed from Token.lemma if no POS tags
    are available."""
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@ -62,8 +62,7 @@ def tagger():
    # need to add model for two reasons:
    # 1. no model leads to error in serialization,
    # 2. the affected line is the one for model serialization
-    with pytest.warns(UserWarning):
-        tagger.begin_training(pipeline=nlp.pipeline)
+    tagger.begin_training(pipeline=nlp.pipeline)
    return tagger


--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@ -44,8 +44,8 @@ def blank_parser(en_vocab):
 def taggers(en_vocab):
    cfg = {"model": DEFAULT_TAGGER_MODEL}
    model = registry.make_from_config(cfg, validate=True)["model"]
-    tagger1 = Tagger(en_vocab, model, set_morphology=True)
-    tagger2 = Tagger(en_vocab, model, set_morphology=True)
+    tagger1 = Tagger(en_vocab, model)
+    tagger2 = Tagger(en_vocab, model)
    return tagger1, tagger2


@ -125,8 +125,8 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
        tagger2.to_disk(file_path2)
        cfg = {"model": DEFAULT_TAGGER_MODEL}
        model = registry.make_from_config(cfg, validate=True)["model"]
-        tagger1_d = Tagger(en_vocab, model, set_morphology=True).from_disk(file_path1)
-        tagger2_d = Tagger(en_vocab, model, set_morphology=True).from_disk(file_path2)
+        tagger1_d = Tagger(en_vocab, model).from_disk(file_path1)
+        tagger2_d = Tagger(en_vocab, model).from_disk(file_path2)
        assert tagger1_d.to_bytes() == tagger2_d.to_bytes()


--- a/spacy/tests/serialize/test_serialize_vocab_strings.py
+++ b/spacy/tests/serialize/test_serialize_vocab_strings.py
@ -8,7 +8,6 @@ from ..util import make_tempdir

 test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])]
 test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
-default_strings = ("_SP", "POS=SPACE")


@pytest.mark.parametrize("text", ["rat"])
@ -34,10 +33,8 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
    assert vocab1.to_bytes() == vocab1_b
    new_vocab1 = Vocab().from_bytes(vocab1_b)
    assert new_vocab1.to_bytes() == vocab1_b
-    assert len(new_vocab1.strings) == len(strings1) + 2  # adds _SP and POS=SPACE
-    assert sorted([s for s in new_vocab1.strings]) == sorted(
-        strings1 + list(default_strings)
-    )
+    assert len(new_vocab1.strings) == len(strings1)
+    assert sorted([s for s in new_vocab1.strings]) == sorted(strings1)


@pytest.mark.parametrize("strings1,strings2", test_strings)
@ -52,16 +49,12 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
        vocab1_d = Vocab().from_disk(file_path1)
        vocab2_d = Vocab().from_disk(file_path2)
        # check strings rather than lexemes, which are only reloaded on demand
-        assert strings1 == [s for s in vocab1_d.strings if s not in default_strings]
-        assert strings2 == [s for s in vocab2_d.strings if s not in default_strings]
+        assert strings1 == [s for s in vocab1_d.strings]
+        assert strings2 == [s for s in vocab2_d.strings]
        if strings1 == strings2:
-            assert [s for s in vocab1_d.strings if s not in default_strings] == [
-                s for s in vocab2_d.strings if s not in default_strings
-            ]
+            assert [s for s in vocab1_d.strings] == [s for s in vocab2_d.strings]
        else:
-            assert [s for s in vocab1_d.strings if s not in default_strings] != [
-                s for s in vocab2_d.strings if s not in default_strings
-            ]
+            assert [s for s in vocab1_d.strings] != [s for s in vocab2_d.strings]


@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
@ -80,7 +73,7 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr):
    # Reported in #2153
    vocab = Vocab(strings=strings)
    vocab.from_bytes(vocab.to_bytes())
-    assert len(vocab.strings) == len(strings) + 2  # adds _SP and POS=SPACE
+    assert len(vocab.strings) == len(strings)


@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
--- a/spacy/tests/test_lemmatizer.py
+++ b/spacy/tests/test_lemmatizer.py
@ -1,64 +0,0 @@
-import pytest
-from spacy.tokens import Doc
-from spacy.language import Language
-from spacy.lookups import Lookups
-from spacy.lemmatizer import Lemmatizer
-
-
-@pytest.mark.skip(reason="We probably don't want to support this anymore in v3?")
-def test_lemmatizer_reflects_lookups_changes():
-    """Test for an issue that'd cause lookups available in a model loaded from
-    disk to not be reflected in the lemmatizer."""
-    nlp = Language()
-    assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "foo"
-    table = nlp.vocab.lookups.add_table("lemma_lookup")
-    table["foo"] = "bar"
-    assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "bar"
-    table = nlp.vocab.lookups.get_table("lemma_lookup")
-    table["hello"] = "world"
-    # The update to the table should be reflected in the lemmatizer
-    assert Doc(nlp.vocab, words=["hello"])[0].lemma_ == "world"
-    new_nlp = Language()
-    table = new_nlp.vocab.lookups.add_table("lemma_lookup")
-    table["hello"] = "hi"
-    assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "hi"
-    nlp_bytes = nlp.to_bytes()
-    new_nlp.from_bytes(nlp_bytes)
-    # Make sure we have the previously saved lookup table
-    assert "lemma_lookup" in new_nlp.vocab.lookups
-    assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2
-    assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world"
-    assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar"
-    assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world"
-
-
-def test_tagger_warns_no_lookups():
-    nlp = Language()
-    nlp.vocab.lookups = Lookups()
-    assert not len(nlp.vocab.lookups)
-    tagger = nlp.add_pipe("tagger")
-    with pytest.warns(UserWarning):
-        tagger.begin_training()
-    with pytest.warns(UserWarning):
-        nlp.begin_training()
-    nlp.vocab.lookups.add_table("lemma_lookup")
-    nlp.vocab.lookups.add_table("lexeme_norm")
-    nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
-    with pytest.warns(None) as record:
-        nlp.begin_training()
-        assert not record.list
-
-
-def test_lemmatizer_without_is_base_form_implementation():
-    # Norwegian example from #5658
-    lookups = Lookups()
-    lookups.add_table("lemma_rules", {"noun": []})
-    lookups.add_table("lemma_index", {"noun": {}})
-    lookups.add_table("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}})
-
-    lemmatizer = Lemmatizer(lookups, is_base_form=None)
-    assert lemmatizer(
-        "Formuesskatten",
-        "noun",
-        {"Definite": "def", "Gender": "masc", "Number": "sing"},
-    ) == ["formuesskatt"]
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@ -112,16 +112,15 @@ def test_tokenizer_validate_special_case(tokenizer, text, tokens):


@pytest.mark.parametrize(
-    "text,tokens", [("lorem", [{"orth": "lo", "tag": "NN"}, {"orth": "rem"}])]
+    "text,tokens", [("lorem", [{"orth": "lo", "norm": "LO"}, {"orth": "rem"}])]
 )
 def test_tokenizer_add_special_case_tag(text, tokens):
-    vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
+    vocab = Vocab()
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    tokenizer.add_special_case(text, tokens)
    doc = tokenizer(text)
    assert doc[0].text == tokens[0]["orth"]
-    assert doc[0].tag_ == tokens[0]["tag"]
-    assert doc[0].pos_ == "NOUN"
+    assert doc[0].norm_ == tokens[0]["norm"]
    assert doc[1].text == tokens[1]["orth"]


--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@ -11,7 +11,7 @@ from .span cimport Span
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..structs cimport LexemeC, TokenC
-from ..attrs cimport TAG, MORPH
+from ..attrs cimport MORPH
 from ..vocab cimport Vocab

 from .underscore import is_writable_attr
@ -365,8 +365,6 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
                    doc[token_index + i]._.set(ext_attr_key, ext_attr_value)
            # NB: We need to call get_string_id here because only the keys are
            # "intified" (since we support "KEY": [value, value] syntax here).
-            elif attr_name == TAG:
-                doc.vocab.morphology.assign_tag(token, get_string_id(attr_value))
            else:
                # Set attributes on both token and lexeme to take care of token
                # attribute vs. lexical attribute without having to enumerate
@ -431,8 +429,6 @@ def set_token_attrs(Token py_token, attrs):
        if attr_name == "_":  # Set extension attributes
            for ext_attr_key, ext_attr_value in attr_value.items():
                py_token._.set(ext_attr_key, ext_attr_value)
-        elif attr_name == TAG:
-            doc.vocab.morphology.assign_tag(token, attr_value)
        else:
            # Set attributes on both token and lexeme to take care of token
            # attribute vs. lexical attribute without having to enumerate
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -832,13 +832,6 @@ cdef class Doc:
                            rel_head_index=abs_head_index-i
                        )
                    )
-        # Do TAG first. This lets subsequent loop override stuff like POS, LEMMA
-        if TAG in attrs:
-            col = attrs.index(TAG)
-            for i in range(length):
-                value = values[col * stride + i]
-                if value != 0:
-                    self.vocab.morphology.assign_tag(&tokens[i], value)
        # Verify ENT_IOB are proper integers
        if ENT_IOB in attrs:
            iob_strings = Token.iob_strings()
@ -857,12 +850,11 @@ cdef class Doc:
        for i in range(length):
            token = &self.c[i]
            for j in range(n_attrs):
-                if attr_ids[j] != TAG:
-                    value = values[j * stride + i]
-                    if attr_ids[j] == MORPH:
-                        # add morph to morphology table
-                        self.vocab.morphology.add(self.vocab.strings[value])
-                    Token.set_struct_attr(token, attr_ids[j], value)
+                value = values[j * stride + i]
+                if attr_ids[j] == MORPH:
+                    # add morph to morphology table
+                    self.vocab.morphology.add(self.vocab.strings[value])
+                Token.set_struct_attr(token, attr_ids[j], value)
        # Set flags
        self.is_parsed = bool(self.is_parsed or HEAD in attrs)
        self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -332,11 +332,7 @@ cdef class Token:
            inflectional suffixes.
        """
        def __get__(self):
-            if self.c.lemma == 0:
-                lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth)
-                return self.vocab.strings[lemma_]
-            else:
-                return self.c.lemma
+            return self.c.lemma

        def __set__(self, attr_t lemma):
            self.c.lemma = lemma
@ -355,7 +351,7 @@ cdef class Token:
            return self.c.tag

        def __set__(self, attr_t tag):
-            self.vocab.morphology.assign_tag(self.c, tag)
+            self.c.tag = tag

    property dep:
        """RETURNS (uint64): ID of syntactic dependency label."""
@ -888,10 +884,7 @@ cdef class Token:
            with no inflectional suffixes.
        """
        def __get__(self):
-            if self.c.lemma == 0:
-                return self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth)
-            else:
-                return self.vocab.strings[self.c.lemma]
+            return self.vocab.strings[self.c.lemma]

        def __set__(self, unicode lemma_):
            self.c.lemma = self.vocab.strings.add(lemma_)
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -9,11 +9,10 @@ from .lexeme cimport EMPTY_LEXEME, OOV_RANK
 from .lexeme cimport Lexeme
 from .typedefs cimport attr_t
 from .tokens.token cimport Token
-from .attrs cimport LANG, ORTH, TAG, POS
+from .attrs cimport LANG, ORTH

 from .compat import copy_reg
 from .errors import Errors
-from .lemmatizer import Lemmatizer
 from .attrs import intify_attrs, NORM, IS_STOP
 from .vectors import Vectors
 from .util import registry
@ -23,7 +22,7 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang


-def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=True):
+def create_vocab(lang, defaults, vectors_name=None, load_data=True):
    # If the spacy-lookups-data package is installed, we pre-populate the lookups
    # with lexeme data, if available
    if load_data:
@ -43,7 +42,6 @@ def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=T
    )
    return Vocab(
        lex_attr_getters=lex_attrs,
-        lemmatizer=lemmatizer,
        lookups=lookups,
        writing_system=defaults.writing_system,
        get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
@ -58,17 +56,13 @@ cdef class Vocab:

    DOCS: https://spacy.io/api/vocab
    """
-    def __init__(self, lex_attr_getters=None, lemmatizer=None,
-                 strings=tuple(), lookups=None, tag_map={},
+    def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
                 oov_prob=-20., vectors_name=None, writing_system={},
                 get_noun_chunks=None, **deprecated_kwargs):
        """Create the vocabulary.

        lex_attr_getters (dict): A dictionary mapping attribute IDs to
            functions to compute them. Defaults to `None`.
-        tag_map (dict): Dictionary mapping fine-grained tags to coarse-grained
-            parts-of-speech, and optionally morphological attributes.
-        lemmatizer (object): A lemmatizer. Defaults to `None`.
        strings (StringStore): StringStore that maps strings to integers, and
            vice versa.
        lookups (Lookups): Container for large lookup tables and dictionaries.
@ -78,8 +72,6 @@ cdef class Vocab:
        lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
        if lookups in (None, True, False):
            lookups = Lookups()
-        if lemmatizer in (None, True, False):
-            lemmatizer = Lemmatizer(lookups)
        self.cfg = {'oov_prob': oov_prob}
        self.mem = Pool()
        self._by_orth = PreshMap()
@ -89,7 +81,7 @@ cdef class Vocab:
            for string in strings:
                _ = self[string]
        self.lex_attr_getters = lex_attr_getters
-        self.morphology = Morphology(self.strings, tag_map, lemmatizer)
+        self.morphology = Morphology(self.strings)
        self.vectors = Vectors(name=vectors_name)
        self.lookups = lookups
        self.writing_system = writing_system
@ -268,12 +260,6 @@ cdef class Vocab:
            # Set the special tokens up to have arbitrary attributes
            lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
            token.lex = lex
-            if TAG in props:
-                self.morphology.assign_tag(token, props[TAG])
-            elif POS in props:
-                # Don't allow POS to be set without TAG -- this causes problems,
-                # see #1773
-                props.pop(POS)
            for attr_id, value in props.items():
                Token.set_struct_attr(token, attr_id, value)
                # NORM is the only one that overlaps between the two
--- a/website/docs/api/lemmatizer.md
+++ b/website/docs/api/lemmatizer.md
@ -1,102 +1,263 @@
 ---
 title: Lemmatizer
-teaser: Assign the base forms of words
 tag: class
-source: spacy/lemmatizer.py
+source: spacy/pipeline/lemmatizer.py
+new: 3
+teaser: 'Pipeline component for lemmatization'
+api_base_class: /api/pipe
+api_string_name: lemmatizer
+api_trainable: false
 ---

-<!-- TODO: rewrite once it's converted to pipe -->
+## Config and implementation

-The `Lemmatizer` supports simple part-of-speech-sensitive suffix rules and
-lookup tables.
+The default config is defined by the pipeline component factory and describes
+how the component should be configured. You can override its settings via the
+`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
+[`config.cfg` for training](/usage/training#config).
+
+For examples of the lookups data formats used by the lookup and rule-based
+lemmatizers, see the
+[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo.
+
+> #### Example
+>
+> ```python
+> config = {"mode": "rule"}
+> nlp.add_pipe("lemmatizer", config=config)
+> ```
+
+| Setting     | Type                                       | Description                                                                                                                                                                    | Default    |
+| ----------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ---------- |
+| `mode`      | str                                        | The lemmatizer mode, e.g. "lookup" or "rule".                                                                                                                                  | `"lookup"` |
+| `lookups`   | [`Lookups`](/api/lookups)                  | The lookups object containing the tables such as "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". If `None`, default tables are loaded from `spacy-lookups-data`. | `None`     |
+| `overwrite` | bool                                       | Whether to overwrite existing lemmas.                                                                                                                                          | `False`    |
+| `model`     | [`Model`](https://thinc.ai/docs/api-model) | **Not yet implemented:** the model to use.                                                                                                                                     | `None`     |
+
+```python
+https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/lemmatizer.py
+```

 ## Lemmatizer.\_\_init\_\_ {#init tag="method"}

-Initialize a `Lemmatizer`. Typically, this happens under the hood within spaCy
-when a `Language` subclass and its `Vocab` is initialized.
-
 > #### Example
 >
 > ```python
-> from spacy.lemmatizer import Lemmatizer
-> from spacy.lookups import Lookups
-> lookups = Lookups()
-> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
-> lemmatizer = Lemmatizer(lookups)
-> ```
+> # Construction via add_pipe with default settings
+> lemmatizer = nlp.add_pipe("lemmatizer")
 >
-> For examples of the data format, see the
-> [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo.
+> # Construction via add_pipe with custom settings
+> config = {"mode": "rule", "overwrite": True}
+> lemmatizer = nlp.add_pipe("lemmatizer", config=config)
+> ```

-| Name                                   | Type                      | Description                                                                                                               |
-| -------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
-| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. |
+Create a new pipeline instance. In your application, you would normally use a
+shortcut for this and instantiate the component using its string name and
+[`nlp.add_pipe`](/api/language#add_pipe).
+
+| Name           | Type                                       | Description                                                                                                                      |
+| -------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`        | [`Vocab`](/api/vocab)                      | The vocab.                                                                                                                       |
+| `model`        | [`Model`](https://thinc.ai/docs/api-model) | A model (not yet implemented).                                                                                                   |
+| `name`         | str                                        | String name of the component instance. Used to add entries to the `losses` during training.                                      |
+| _keyword-only_ |                                            |                                                                                                                                  |
+| `mode`         | str                                        | The lemmatizer mode, e.g. "lookup" or "rule". Defaults to "lookup".                                                              |
+| `lookups`      | [`Lookups`](/api/lookups)                  | A lookups object containing the tables such as "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". Defaults to `None`. |
+| `overwrite`    | bool                                       | Whether to overwrite existing lemmas.                                                                                            |

 ## Lemmatizer.\_\_call\_\_ {#call tag="method"}

-Lemmatize a string.
+Apply the pipe to one document. The document is modified in place, and returned.
+This usually happens under the hood when the `nlp` object is called on a text
+and all pipeline components are applied to the `Doc` in order.

 > #### Example
 >
 > ```python
-> from spacy.lemmatizer import Lemmatizer
-> from spacy.lookups import Lookups
-> lookups = Lookups()
-> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
-> lemmatizer = Lemmatizer(lookups)
-> lemmas = lemmatizer("ducks", "NOUN")
-> assert lemmas == ["duck"]
+> doc = nlp("This is a sentence.")
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> # This usually happens under the hood
+> processed = lemmatizer(doc)
 > ```

-| Name         | Type          | Description                                                                                              |
-| ------------ | ------------- | -------------------------------------------------------------------------------------------------------- |
-| `string`     | str           | The string to lemmatize, e.g. the token text.                                                            |
-| `univ_pos`   | str / int     | The token's universal part-of-speech tag.                                                                |
-| `morphology` | dict / `None` | Morphological features following the [Universal Dependencies](http://universaldependencies.org/) scheme. |
-| **RETURNS**  | list          | The available lemmas for the string.                                                                     |
+| Name        | Type  | Description              |
+| ----------- | ----- | ------------------------ |
+| `doc`       | `Doc` | The document to process. |
+| **RETURNS** | `Doc` | The processed document.  |

-## Lemmatizer.lookup {#lookup tag="method" new="2"}
+## Lemmatizer.pipe {#pipe tag="method"}

-Look up a lemma in the lookup table, if available. If no lemma is found, the
+Apply the pipe to a stream of documents. This usually happens under the hood
+when the `nlp` object is called on a text and all pipeline components are
+applied to the `Doc` in order.
+
+> #### Example
+>
+> ```python
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> for doc in lemmatizer.pipe(docs, batch_size=50):
+>     pass
+> ```
+
+| Name           | Type            | Description                                            |
+| -------------- | --------------- | ------------------------------------------------------ |
+| `stream`       | `Iterable[Doc]` | A stream of documents.                                 |
+| _keyword-only_ |                 |                                                        |
+| `batch_size`   | int             | The number of texts to buffer. Defaults to `128`.      |
+| **YIELDS**     | `Doc`           | Processed documents in the order of the original text. |
+
+## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"}
+
+Lemmatize a token using a lookup-based approach. If no lemma is found, the
 original string is returned. Languages can provide a
 [lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`.

-> #### Example
->
-> ```python
-> lookups = Lookups()
-> lookups.add_table("lemma_lookup", {"going": "go"})
-> assert lemmatizer.lookup("going") == "go"
-> ```
+| Name        | Type                  | Description                           |
+| ----------- | --------------------- | ------------------------------------- |
+| `token`     | [`Token`](/api/token) | The token to lemmatize.               |
+| **RETURNS** | `List[str]`           | A list containing one or more lemmas. |

-| Name        | Type | Description                                                                                                 |
-| ----------- | ---- | ----------------------------------------------------------------------------------------------------------- |
-| `string`    | str  | The string to look up.                                                                                      |
-| `orth`      | int  | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. |
-| **RETURNS** | str  | The lemma if the string was found, otherwise the original string.                                           |
+## Lemmatizer.rule_lemmatize {#rule_lemmatize tag="method"}
+
+Lemmatize a token using a rule-based approach. Typically relies on POS tags.
+
+| Name        | Type                  | Description                           |
+| ----------- | --------------------- | ------------------------------------- |
+| `token`     | [`Token`](/api/token) | The token to lemmatize.               |
+| **RETURNS** | `List[str]`           | A list containing one or more lemmas. |

 ## Lemmatizer.is_base_form {#is_base_form tag="method"}

 Check whether we're dealing with an uninflected paradigm, so we can avoid
 lemmatization entirely.

+| Name        | Type                  | Description                                                                                             |
+| ----------- | --------------------- | ------------------------------------------------------------------------------------------------------- |
+| `token`     | [`Token`](/api/token) | The token to analyze.                                                                                   |
+| **RETURNS** | bool                  | Whether the token's attributes (e.g., part-of-speech tag, morphological features) describe a base form. |
+
+## Lemmatizer.get_lookups_config {#get_lookups_config tag="classmethod"}
+
+Returns the lookups configuration settings for a given mode for use in
+[`Lemmatizer.load_lookups`](#load_lookups).
+
+| Name        | Type | Description                                       |
+| ----------- | ---- | ------------------------------------------------- |
+| `mode`      | str  | The lemmatizer mode.                              |
+| **RETURNS** | dict | The lookups configuration settings for this mode. |
+
+## Lemmatizer.load_lookups {#load_lookups tag="classmethod"}
+
+Load and validate lookups tables. If the provided lookups is `None`, load the
+default lookups tables according to the language and mode settings. Confirm that
+all required tables for the language and mode are present.
+
+| Name        | Type                      | Description                                                                  |
+| ----------- | ------------------------- | ---------------------------------------------------------------------------- |
+| `lang`      | str                       | The language.                                                                |
+| `mode`      | str                       | The lemmatizer mode.                                                         |
+| `lookups`   | [`Lookups`](/api/lookups) | The provided lookups, may be `None` if the default lookups should be loaded. |
+| **RETURNS** | [`Lookups`](/api/lookups) | The lookups object.                                                          |
+
+## Lemmatizer.to_disk {#to_disk tag="method"}
+
+Serialize the pipe to disk.
+
 > #### Example
 >
 > ```python
-> pos = "verb"
-> morph = {"VerbForm": "inf"}
-> is_base_form = lemmatizer.is_base_form(pos, morph)
-> assert is_base_form == True
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> lemmatizer.to_disk("/path/to/lemmatizer")
 > ```

-| Name         | Type      | Description                                                                             |
-| ------------ | --------- | --------------------------------------------------------------------------------------- |
-| `univ_pos`   | str / int | The token's universal part-of-speech tag.                                               |
-| `morphology` | dict      | The token's morphological features.                                                     |
-| **RETURNS**  | bool      | Whether the token's part-of-speech tag and morphological features describe a base form. |
+| Name           | Type            | Description                                                                                                           |
+| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
+| `path`         | str / `Path`    | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| _keyword-only_ |                 |                                                                                                                       |
+| `exclude`      | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude.                                             |
+
+## Lemmatizer.from_disk {#from_disk tag="method"}
+
+Load the pipe from disk. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> lemmatizer.from_disk("/path/to/lemmatizer")
+> ```
+
+| Name           | Type            | Description                                                                |
+| -------------- | --------------- | -------------------------------------------------------------------------- |
+| `path`         | str / `Path`    | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| _keyword-only_ |                 |                                                                            |
+| `exclude`      | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude.  |
+| **RETURNS**    | `Lemmatizer`    | The modified `Lemmatizer` object.                                          |
+
+## Lemmatizer.to_bytes {#to_bytes tag="method"}
+
+> #### Example
+>
+> ```python
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> lemmatizer_bytes = lemmatizer.to_bytes()
+> ```
+
+Serialize the pipe to a bytestring.
+
+| Name           | Type            | Description                                                               |
+| -------------- | --------------- | ------------------------------------------------------------------------- |
+| _keyword-only_ |                 |                                                                           |
+| `exclude`      | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS**    | bytes           | The serialized form of the `Lemmatizer` object.                           |
+
+## Lemmatizer.from_bytes {#from_bytes tag="method"}
+
+Load the pipe from a bytestring. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> lemmatizer_bytes = lemmatizer.to_bytes()
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> lemmatizer.from_bytes(lemmatizer_bytes)
+> ```
+
+| Name           | Type            | Description                                                               |
+| -------------- | --------------- | ------------------------------------------------------------------------- |
+| `bytes_data`   | bytes           | The data to load from.                                                    |
+| _keyword-only_ |                 |                                                                           |
+| `exclude`      | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS**    | `Lemmatizer`    | The `Lemmatizer` object.                                                  |
+
+## Lemmatizer.mode {#mode tag="property"}
+
+The lemmatizer mode.
+
+| Name        | Type  | Description          |
+| ----------- | ----- | -------------------- |
+| **RETURNS** | `str` | The lemmatizer mode. |

 ## Attributes {#attributes}

-| Name                                   | Type                      | Description                                                     |
-| -------------------------------------- | ------------------------- | --------------------------------------------------------------- |
-| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the rules and data, if available. |
+| Name      | Type                              | Description         |
+| --------- | --------------------------------- | ------------------- |
+| `vocab`   | [`Vocab`](/api/vocab)             | The shared vocab.   |
+| `lookups` | [`Lookups`](/api/lookups)         | The lookups object. |
+
+## Serialization fields {#serialization-fields}
+
+During serialization, spaCy will export several data fields used to restore
+different aspects of the object. If needed, you can exclude them from
+serialization by passing in the string names via the `exclude` argument.
+
+> #### Example
+>
+> ```python
+> lemmatizer.to_disk("/path", exclude=["vocab"])
+> ```
+
+| Name      | Description                                          |
+| --------- | ---------------------------------------------------- |
+| `vocab`   | The shared [`Vocab`](/api/vocab).                    |
+| `lookups` | The lookups. You usually don't want to exclude this. |
--- a/website/docs/api/morphology.md
+++ b/website/docs/api/morphology.md
@ -11,22 +11,19 @@ this class.

 ## Morphology.\_\_init\_\_ {#init tag="method"}

-Create a Morphology object using the tag map, lemmatizer and exceptions.
+Create a Morphology object.

 > #### Example
 >
 > ```python
 > from spacy.morphology import Morphology
 >
-> morphology = Morphology(strings, tag_map, lemmatizer)
+> morphology = Morphology(strings)
 > ```

-| Name         | Type              | Description                                                                                                |
-| ------------ | ----------------- | ---------------------------------------------------------------------------------------------------------- |
-| `strings`    | `StringStore`     | The string store.                                                                                          |
-| `tag_map`    | `Dict[str, Dict]` | The tag map.                                                                                               |
-| `lemmatizer` | `Lemmatizer`      | The lemmatizer.                                                                                            |
-| `exc`        | `Dict[str, Dict]` | A dictionary of exceptions in the format `{tag: {orth: {"POS": "X", "Feat1": "Val1, "Feat2": "Val2", ...}` |
+| Name      | Type          | Description       |
+| --------- | ------------- | ----------------- |
+| `strings` | `StringStore` | The string store. |

 ## Morphology.add {#add tag="method"}

@ -62,52 +59,6 @@ Get the FEATS string for the hash of the morphological analysis.
 | ------- | ---- | --------------------------------------- |
 | `morph` | int  | The hash of the morphological analysis. |

-## Morphology.load_tag_map {#load_tag_map tag="method"}
-
-Replace the current tag map with the provided tag map.
-
-| Name      | Type              | Description  |
-| --------- | ----------------- | ------------ |
-| `tag_map` | `Dict[str, Dict]` | The tag map. |
-
-## Morphology.load_morph_exceptions {#load_morph_exceptions tag="method"}
-
-Replace the current morphological exceptions with the provided exceptions.
-
-| Name          | Type              | Description                   |
-| ------------- | ----------------- | ----------------------------- |
-| `morph_rules` | `Dict[str, Dict]` | The morphological exceptions. |
-
-## Morphology.add_special_case {#add_special_case tag="method"}
-
-Add a special-case rule to the morphological analyzer. Tokens whose tag and orth
-match the rule will receive the specified properties.
-
-> #### Example
->
-> ```python
-> attrs = {"POS": "DET", "Definite": "Def"}
-> morphology.add_special_case("DT", "the", attrs)
-> ```
-
-| Name       | Type | Description                                    |
-| ---------- | ---- | ---------------------------------------------- |
-| `tag_str`  | str  | The fine-grained tag.                          |
-| `orth_str` | str  | The token text.                                |
-| `attrs`    | dict | The features to assign for this token and tag. |
-
-## Morphology.exc {#exc tag="property"}
-
-The current morphological exceptions.
-
-| Name       | Type | Description                                         |
-| ---------- | ---- | --------------------------------------------------- |
-| **YIELDS** | dict | The current dictionary of morphological exceptions. |
-
-## Morphology.lemmatize {#lemmatize tag="method"}
-
-TODO
-
 ## Morphology.feats_to_dict {#feats_to_dict tag="staticmethod"}

 Convert a string FEATS representation to a dictionary of features and values in
--- a/website/docs/api/tagger.md
+++ b/website/docs/api/tagger.md
@ -47,7 +47,7 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx
 >
 > # Construction via add_pipe with custom model
 > config = {"model": {"@architectures": "my_tagger"}}
-> parser = nlp.add_pipe("tagger", config=config)
+> tagger = nlp.add_pipe("tagger", config=config)
 >
 > # Construction from class
 > from spacy.pipeline import Tagger
@ -285,16 +285,14 @@ Add a new label to the pipe.
 > #### Example
 >
 > ```python
-> from spacy.symbols import POS
 > tagger = nlp.add_pipe("tagger")
-> tagger.add_label("MY_LABEL", {POS: "NOUN"})
+> tagger.add_label("MY_LABEL")
 > ```

-| Name        | Type             | Description                                                     |
-| ----------- | ---------------- | --------------------------------------------------------------- |
-| `label`     | str              | The label to add.                                               |
-| `values`    | `Dict[int, str]` | Optional values to map to the label, e.g. a tag map dictionary. |
-| **RETURNS** | int              | `0` if the label is already present, otherwise `1`.             |
+| Name        | Type | Description                                         |
+| ----------- | ---- | --------------------------------------------------- |
+| `label`     | str  | The label to add.                                   |
+| **RETURNS** | int  | `0` if the label is already present, otherwise `1`. |

 ## Tagger.to_disk {#to_disk tag="method"}

@ -369,9 +367,7 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.

 ## Tagger.labels {#labels tag="property"}

-The labels currently added to the component. Note that even for a blank
-component, this will always include the built-in coarse-grained part-of-speech
-tags by default, e.g. `VERB`, `NOUN` and so on.
+The labels currently added to the component.

 > #### Example
 >
@ -396,9 +392,8 @@ serialization by passing in the string names via the `exclude` argument.
 > data = tagger.to_disk("/path", exclude=["vocab"])
 > ```

-| Name      | Description                                                                                |
-| --------- | ------------------------------------------------------------------------------------------ |
-| `vocab`   | The shared [`Vocab`](/api/vocab).                                                          |
-| `cfg`     | The config file. You usually don't want to exclude this.                                   |
-| `model`   | The binary model data. You usually don't want to exclude this.                             |
-| `tag_map` | The [tag map](/usage/adding-languages#tag-map) mapping fine-grained to coarse-grained tag. |
+| Name    | Description                                                    |
+| ------- | -------------------------------------------------------------- |
+| `vocab` | The shared [`Vocab`](/api/vocab).                              |
+| `cfg`   | The config file. You usually don't want to exclude this.       |
+| `model` | The binary model data. You usually don't want to exclude this. |
--- a/website/docs/api/vocab.md
+++ b/website/docs/api/vocab.md
@ -24,8 +24,6 @@ Create the vocabulary.
 | Name                                         | Type                 | Description                                                                                                                                                 |
 | -------------------------------------------- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `lex_attr_getters`                           | dict                 | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`.                                                                        |
-| `tag_map`                                    | dict                 | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes.                                          |
-| `lemmatizer`                                 | object               | A lemmatizer. Defaults to `None`.                                                                                                                           |
 | `strings`                                    | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings.                                                 |
 | `lookups`                                    | `Lookups`            | A [`Lookups`](/api/lookups) that stores the `lemma_\*`, `lexeme_norm` and other large lookup tables. Defaults to `None`.                                    |
 | `lookups_extra` <Tag variant="new">2.3</Tag> | `Lookups`            | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. |