Add Lemmatizer and simplify related components (#5848)

* Add Lemmatizer and simplify related components

* Add `Lemmatizer` pipe with `lookup` and `rule` modes using the
  `Lookups` tables (usage sketch after the list of differences below).
* Reduce `Tagger` to a simple tagger that sets `Token.tag` (no pos or lemma)
* Reduce `Morphology` to only keep track of morph tags (no tag map, lemmatizer,
or morph rules)
* Remove lemmatizer from `Vocab`
* Adjust many many tests

Differences:

* No default lookup lemmas
* No special treatment of TAG required in `from_array` and similar methods
* Easier to modify labels in a `Tagger`
* No extra strings added from morphology / tag map
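
The new pipe is configured like any other component factory. A minimal usage
sketch (the factory name "lemmatizer" and the "mode" setting come from the
per-language factories in the diffs below; the sample sentence and the print
call are illustrative only):

    import spacy

    nlp = spacy.blank("en")
    # "lookup" mode only needs the lemma_lookup table (from the
    # spacy-lookups-data package or custom lookups); "rule" mode also
    # reads Token.pos, so a tagger/morphologizer should run first.
    nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
    doc = nlp("I was reading the paper.")
    print([token.lemma_ for token in doc])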

* Fix test

* Initial fix for Lemmatizer config/serialization

* Adjust init test to be more generic

* Adjust init test to force empty Lookups

* Add simple cache to rule-based lemmatizer

* Convert language-specific lemmatizers

Convert language-specific lemmatizers to component lemmatizers. Remove
previous lemmatizer class.

* Fix French and Polish lemmatizers

* Remove outdated UPOS conversions

* Update Russian lemmatizer init in tests

* Add minimal init/run tests for custom lemmatizers

* Add option to overwrite existing lemmas

* Update mode setting, lookup loading, and caching

* Make `mode` an immutable property
* Only enforce strict `load_lookups` for known supported modes
* Move caching into individual `_lemmatize` methods (see the sketch after this list)
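
A condensed sketch of the caching pattern now used inside the individual
`_lemmatize` methods (compare the `rule_lemmatize` implementations in the
diffs below; the `...` body is elided here):

    def rule_lemmatize(self, token):
        # Key on the hashes of the lowercased form and the coarse POS tag.
        cache_key = (token.lower, token.pos)
        if cache_key in self.cache:
            return self.cache[cache_key]
        forms = ...  # apply index, exceptions and suffix rules for token.pos_
        self.cache[cache_key] = forms
        return forms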

* Implement `strict` checks when the language is not found in the lookups

* Fix tables/lookups in make_lemmatizer

* Reallow provided lookups and allow for stricter checks

* Add lookups asset to all Lemmatizer pipe tests

* Rename lookups in lemmatizer init test

* Clean up merge

* Refactor lookup table loading

* Add a `load_lemmatizer_lookups` helper that loads required and
  optional lookups tables based on settings provided by a config.

Additional slight refactor of lookups:

* Add `Lookups.set_table` to set a table from a provided `Table` (sketch after this list)
* Reorder class definitions to be able to specify type as `Table`
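
A small sketch of `Lookups.set_table`; the table name and data here are made
up for illustration:

    from spacy.lookups import Lookups, Table

    lookups = Lookups()
    table = Table(name="lemma_lookup", data={"was": "be"})
    lookups.set_table("lemma_lookup", table)
    assert lookups.get_table("lemma_lookup")["was"] == "be"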

* Move registry assets into test methods

* Refactor lookups tables config

Use class methods within `Lemmatizer` to provide the config for
particular modes and to load the lookups from a config.
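
The pattern, as used by the language-specific lemmatizers in the diffs below;
the subclass name and table list here are placeholders:

    from typing import Dict
    from spacy.pipeline import Lemmatizer

    class CustomLemmatizer(Lemmatizer):
        @classmethod
        def get_lookups_config(cls, mode: str) -> Dict:
            if mode == "rule":
                # Tables this mode needs, from spacy-lookups-data or
                # from user-provided lookups.
                return {"required_tables": ["lemma_rules", "lemma_exc", "lemma_index"]}
            return super().get_lookups_config(mode)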

* Add pipe and score to lemmatizer

* Simplify Tagger.score

* Add missing import

* Clean up imports and auto-format

* Remove unused kwarg

* Tidy up and auto-format

* Update docstrings for Lemmatizer

Update docstrings for Lemmatizer.

Additionally modify `is_base_form` API to take `Token` instead of
individual features.
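
Condensed from the English lemmatizer in the diffs below, to show the changed
signature:

    # Before: a module-level function taking the features directly:
    #     def is_base_form(univ_pos, morphology=None): ...
    # After: a method that reads both off the Token itself.
    def is_base_form(self, token):
        univ_pos = token.pos_.lower()
        morphology = token.morph.to_dict()
        return univ_pos == "noun" and morphology.get("Number") == "Sing"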

* Update docstrings

* Remove tag map values from Tagger.add_label

* Update API docs

* Fix relative link in Lemmatizer API docs
Adriane Boyd 2020-08-07 15:27:13 +02:00 committed by GitHub
parent 1d01d89b79
commit e962784531
59 changed files with 1439 additions and 1609 deletions

View File

@@ -19,9 +19,6 @@ after_pipeline_creation = null
 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"
 
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
 [components]
 
 # Training hyper-parameters and additional features.

View File

@@ -510,7 +510,7 @@ class Errors:
     E952 = ("The section '{name}' is not a valid section in the provided config.")
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive a valid input.")
-    E955 = ("Can't find table '{table}' for language '{lang}' in spacy-lookups-data.")
+    E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
     E956 = ("Can't find component '{name}' in [components] block in the config. "
             "Available components: {opts}")
     E957 = ("Writing directly to Language.factories isn't needed anymore in "
@@ -633,6 +633,11 @@ class Errors:
     E1001 = ("Target token outside of matched span for match with tokens "
             "'{span}' and offset '{index}' matched by patterns '{patterns}'.")
     E1002 = ("Span index out of range.")
+    E1003 = ("Unsupported lemmatizer mode '{mode}'.")
+    E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
+            "Required tables '{tables}', found '{found}'. If you are not "
+            "providing custom lookups, make sure you have the package "
+            "spacy-lookups-data installed.")
 
 
 @add_codes

View File

@@ -1,38 +1,17 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from .lemmatizer import GreekLemmatizer
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from ...lookups import load_lookups
+from .lemmatizer import GreekLemmatizer
+from ...lookups import Lookups
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.el.GreekLemmatizer"
-"""
-
-
-@registry.lemmatizers("spacy.el.GreekLemmatizer")
-def create_lemmatizer() -> Callable[[Language], GreekLemmatizer]:
-    tables = ["lemma_index", "lemma_exc", "lemma_rules"]
-
-    def lemmatizer_factory(nlp: Language) -> GreekLemmatizer:
-        lookups = load_lookups(lang=nlp.lang, tables=tables)
-        return GreekLemmatizer(lookups=lookups)
-
-    return lemmatizer_factory
 
 
 class GreekDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
@@ -47,4 +26,22 @@ class Greek(Language):
     Defaults = GreekDefaults
 
 
+@Greek.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Greek"]

View File

@@ -1,6 +1,7 @@
-from typing import Dict, List
+from typing import List
 
-from ...lemmatizer import Lemmatizer
+from ...pipeline import Lemmatizer
+from ...tokens import Token
 
 
 class GreekLemmatizer(Lemmatizer):
@@ -14,13 +15,27 @@ class GreekLemmatizer(Lemmatizer):
     not applicable for Greek language.
     """
 
-    def lemmatize(
-        self,
-        string: str,
-        index: Dict[str, List[str]],
-        exceptions: Dict[str, Dict[str, List[str]]],
-        rules: Dict[str, List[List[str]]],
-    ) -> List[str]:
+    def rule_lemmatize(self, token: Token) -> List[str]:
+        """Lemmatize using a rule-based approach.
+
+        token (Token): The token to lemmatize.
+        RETURNS (list): The available lemmas for the string.
+        """
+        cache_key = (token.lower, token.pos)
+        if cache_key in self.cache:
+            return self.cache[cache_key]
+        string = token.text
+        univ_pos = token.pos_.lower()
+        if univ_pos in ("", "eol", "space"):
+            return [string.lower()]
+        index_table = self.lookups.get_table("lemma_index", {})
+        exc_table = self.lookups.get_table("lemma_exc", {})
+        rules_table = self.lookups.get_table("lemma_rules", {})
+        index = index_table.get(univ_pos, {})
+        exceptions = exc_table.get(univ_pos, {})
+        rules = rules_table.get(univ_pos, {})
         string = string.lower()
         forms = []
         if string in index:
@@ -42,4 +57,6 @@ class GreekLemmatizer(Lemmatizer):
             forms.extend(oov_forms)
         if not forms:
             forms.append(string)
-        return list(set(forms))
+        forms = list(set(forms))
+        self.cache[cache_key] = forms
+        return forms

View File

@@ -1,39 +1,18 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
-from .lemmatizer import is_base_form
 from .punctuation import TOKENIZER_INFIXES
+from .lemmatizer import EnglishLemmatizer
 from ...language import Language
-from ...lemmatizer import Lemmatizer
-from ...lookups import load_lookups
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.en.EnglishLemmatizer"
-"""
-
-
-@registry.lemmatizers("spacy.en.EnglishLemmatizer")
-def create_lemmatizer() -> Callable[[Language], Lemmatizer]:
-    tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
-
-    def lemmatizer_factory(nlp: Language) -> Lemmatizer:
-        lookups = load_lookups(lang=nlp.lang, tables=tables)
-        return Lemmatizer(lookups=lookups, is_base_form=is_base_form)
-
-    return lemmatizer_factory
+from ...lookups import Lookups
 
 
 class EnglishDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS
@@ -46,4 +25,22 @@ class English(Language):
     Defaults = EnglishDefaults
 
 
+@English.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["English"]

View File

@@ -1,36 +1,43 @@
 from typing import Optional
 
+from ...pipeline import Lemmatizer
+from ...tokens import Token
 
-def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
-    """
-    Check whether we're dealing with an uninflected paradigm, so we can
-    avoid lemmatization entirely.
-
-    univ_pos (unicode / int): The token's universal part-of-speech tag.
-    morphology (dict): The token's morphological features following the
-        Universal Dependencies scheme.
-    """
-    if morphology is None:
-        morphology = {}
-    if univ_pos == "noun" and morphology.get("Number") == "sing":
-        return True
-    elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
-        return True
-    # This maps 'VBP' to base form -- probably just need 'IS_BASE'
-    # morphology
-    elif univ_pos == "verb" and (
-        morphology.get("VerbForm") == "fin"
-        and morphology.get("Tense") == "pres"
-        and morphology.get("Number") is None
-    ):
-        return True
-    elif univ_pos == "adj" and morphology.get("Degree") == "pos":
-        return True
-    elif morphology.get("VerbForm") == "inf":
-        return True
-    elif morphology.get("VerbForm") == "none":
-        return True
-    elif morphology.get("Degree") == "pos":
-        return True
-    else:
-        return False
+class EnglishLemmatizer(Lemmatizer):
+    """English lemmatizer. Only overrides is_base_form.
+    """
+
+    def is_base_form(self, token: Token) -> bool:
+        """
+        Check whether we're dealing with an uninflected paradigm, so we can
+        avoid lemmatization entirely.
+
+        univ_pos (unicode / int): The token's universal part-of-speech tag.
+        morphology (dict): The token's morphological features following the
+            Universal Dependencies scheme.
+        """
+        univ_pos = token.pos_.lower()
+        morphology = token.morph.to_dict()
+        if univ_pos == "noun" and morphology.get("Number") == "Sing":
+            return True
+        elif univ_pos == "verb" and morphology.get("VerbForm") == "Inf":
+            return True
+        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
+        # morphology
+        elif univ_pos == "verb" and (
+            morphology.get("VerbForm") == "Fin"
+            and morphology.get("Tense") == "Pres"
+            and morphology.get("Number") is None
+        ):
+            return True
+        elif univ_pos == "adj" and morphology.get("Degree") == "Pos":
+            return True
+        elif morphology.get("VerbForm") == "Inf":
+            return True
+        elif morphology.get("VerbForm") == "None":
+            return True
+        elif morphology.get("Degree") == "Pos":
+            return True
+        else:
+            return False

View File

@@ -1,5 +1,6 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
@@ -7,33 +8,12 @@ from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
-from .lemmatizer import FrenchLemmatizer, is_base_form
-from ...lookups import load_lookups
+from .lemmatizer import FrenchLemmatizer
+from ...lookups import Lookups
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.fr.FrenchLemmatizer"
-"""
-
-
-@registry.lemmatizers("spacy.fr.FrenchLemmatizer")
-def create_lemmatizer() -> Callable[[Language], FrenchLemmatizer]:
-    tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
-
-    def lemmatizer_factory(nlp: Language) -> FrenchLemmatizer:
-        lookups = load_lookups(lang=nlp.lang, tables=tables)
-        return FrenchLemmatizer(lookups=lookups, is_base_form=is_base_form)
-
-    return lemmatizer_factory
 
 
 class FrenchDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
@@ -49,4 +29,22 @@ class French(Language):
     Defaults = FrenchDefaults
 
 
+@French.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["French"]

View File

@@ -1,8 +1,7 @@
-from typing import Optional, List, Dict
+from typing import List, Dict
 
-from ...lemmatizer import Lemmatizer
-from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
-from ...symbols import SCONJ, CCONJ
+from ...pipeline import Lemmatizer
+from ...tokens import Token
 
 
 class FrenchLemmatizer(Lemmatizer):
@@ -15,65 +14,55 @@ class FrenchLemmatizer(Lemmatizer):
     the lookup table.
     """
 
-    def __call__(
-        self, string: str, univ_pos: str, morphology: Optional[dict] = None
-    ) -> List[str]:
-        lookup_table = self.lookups.get_table("lemma_lookup", {})
-        if "lemma_rules" not in self.lookups:
-            return [lookup_table.get(string, string)]
-        if univ_pos in (NOUN, "NOUN", "noun"):
-            univ_pos = "noun"
-        elif univ_pos in (VERB, "VERB", "verb"):
-            univ_pos = "verb"
-        elif univ_pos in (ADJ, "ADJ", "adj"):
-            univ_pos = "adj"
-        elif univ_pos in (ADP, "ADP", "adp"):
-            univ_pos = "adp"
-        elif univ_pos in (ADV, "ADV", "adv"):
-            univ_pos = "adv"
-        elif univ_pos in (AUX, "AUX", "aux"):
-            univ_pos = "aux"
-        elif univ_pos in (CCONJ, "CCONJ", "cconj"):
-            univ_pos = "cconj"
-        elif univ_pos in (DET, "DET", "det"):
-            univ_pos = "det"
-        elif univ_pos in (PRON, "PRON", "pron"):
-            univ_pos = "pron"
-        elif univ_pos in (PUNCT, "PUNCT", "punct"):
-            univ_pos = "punct"
-        elif univ_pos in (SCONJ, "SCONJ", "sconj"):
-            univ_pos = "sconj"
-        else:
-            return [self.lookup(string)]
+    @classmethod
+    def get_lookups_config(cls, mode: str) -> Dict:
+        if mode == "rule":
+            return {
+                "required_tables": [
+                    "lemma_lookup",
+                    "lemma_rules",
+                    "lemma_exc",
+                    "lemma_index",
+                ],
+                "optional_tables": [],
+            }
+        else:
+            return super().get_lookups_config(mode)
+
+    def rule_lemmatize(self, token: Token) -> List[str]:
+        cache_key = (token.orth, token.pos)
+        if cache_key in self.cache:
+            return self.cache[cache_key]
+        string = token.text
+        univ_pos = token.pos_.lower()
+        if univ_pos in ("", "eol", "space"):
+            return [string.lower()]
+        elif "lemma_rules" not in self.lookups or univ_pos not in (
+            "noun",
+            "verb",
+            "adj",
+            "adp",
+            "adv",
+            "aux",
+            "cconj",
+            "det",
+            "pron",
+            "punct",
+            "sconj",
+        ):
+            return self.lookup_lemmatize(token)
         index_table = self.lookups.get_table("lemma_index", {})
         exc_table = self.lookups.get_table("lemma_exc", {})
         rules_table = self.lookups.get_table("lemma_rules", {})
-        lemmas = self.lemmatize(
-            string,
-            index_table.get(univ_pos, {}),
-            exc_table.get(univ_pos, {}),
-            rules_table.get(univ_pos, []),
-        )
-        return lemmas
-
-    def lookup(self, string: str, orth: Optional[int] = None) -> str:
-        lookup_table = self.lookups.get_table("lemma_lookup", {})
-        if orth is not None and orth in lookup_table:
-            return lookup_table[orth][0]
-        return string
-
-    def lemmatize(
-        self,
-        string: str,
-        index: Dict[str, List[str]],
-        exceptions: Dict[str, Dict[str, List[str]]],
-        rules: Dict[str, List[List[str]]],
-    ) -> List[str]:
         lookup_table = self.lookups.get_table("lemma_lookup", {})
+        index = index_table.get(univ_pos, {})
+        exceptions = exc_table.get(univ_pos, {})
+        rules = rules_table.get(univ_pos, [])
         string = string.lower()
         forms = []
         if string in index:
             forms.append(string)
+            self.cache[cache_key] = forms
             return forms
         forms.extend(exceptions.get(string, []))
         oov_forms = []
@@ -90,45 +79,9 @@ class FrenchLemmatizer(Lemmatizer):
         if not forms:
             forms.extend(oov_forms)
         if not forms and string in lookup_table.keys():
-            forms.append(lookup_table[string][0])
+            forms.append(self.lookup_lemmatize(token)[0])
         if not forms:
             forms.append(string)
-        return list(set(forms))
-
-
-def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
-    """
-    Check whether we're dealing with an uninflected paradigm, so we can
-    avoid lemmatization entirely.
-    """
-    morphology = {} if morphology is None else morphology
-    others = [
-        key
-        for key in morphology
-        if key not in (POS, "Number", "POS", "VerbForm", "Tense")
-    ]
-    if univ_pos == "noun" and morphology.get("Number") == "sing":
-        return True
-    elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
-        return True
-    # This maps 'VBP' to base form -- probably just need 'IS_BASE'
-    # morphology
-    elif univ_pos == "verb" and (
-        morphology.get("VerbForm") == "fin"
-        and morphology.get("Tense") == "pres"
-        and morphology.get("Number") is None
-        and not others
-    ):
-        return True
-    elif univ_pos == "adj" and morphology.get("Degree") == "pos":
-        return True
-    elif "VerbForm=inf" in morphology:
-        return True
-    elif "VerbForm=none" in morphology:
-        return True
-    elif "Number=sing" in morphology:
-        return True
-    elif "Degree=pos" in morphology:
-        return True
-    else:
-        return False
+        forms = list(set(forms))
+        self.cache[cache_key] = forms
+        return forms

View File

@@ -38,8 +38,6 @@ def create_tokenizer(split_mode: Optional[str] = None):
 class JapaneseTokenizer(DummyTokenizer):
     def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
         self.vocab = nlp.vocab
-        # TODO: is this the right way to do it?
-        self.vocab.morphology.load_tag_map(TAG_MAP)
         self.split_mode = split_mode
         self.tokenizer = try_sudachi_import(self.split_mode)

View File

@@ -7,6 +7,7 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...compat import copy_reg
+from ...symbols import POS
 from ...util import DummyTokenizer, registry
@@ -29,8 +30,6 @@ def create_tokenizer():
 class KoreanTokenizer(DummyTokenizer):
     def __init__(self, nlp: Optional[Language] = None):
         self.vocab = nlp.vocab
-        # TODO: is this the right way to do it?
-        self.vocab.morphology.load_tag_map(TAG_MAP)
         MeCab = try_mecab_import()
         self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
@@ -44,6 +43,7 @@ class KoreanTokenizer(DummyTokenizer):
         for token, dtoken in zip(doc, dtokens):
             first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
             token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
+            token.pos = TAG_MAP[token.tag_][POS]
             token.lemma_ = dtoken["lemma"]
         doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
         return doc

View File

@@ -1,5 +1,6 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+from thinc.api import Model
 
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -7,32 +8,11 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
-from ...lookups import load_lookups
+from ...lookups import Lookups
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.nl.DutchLemmatizer"
-"""
-
-
-@registry.lemmatizers("spacy.nl.DutchLemmatizer")
-def create_lemmatizer() -> Callable[[Language], DutchLemmatizer]:
-    tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
-
-    def lemmatizer_factory(nlp: Language) -> DutchLemmatizer:
-        lookups = load_lookups(lang=nlp.lang, tables=tables)
-        return DutchLemmatizer(lookups=lookups)
-
-    return lemmatizer_factory
 
 
 class DutchDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
@@ -46,4 +26,22 @@ class Dutch(Language):
     Defaults = DutchDefaults
 
 
+@Dutch.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Dutch"]

View File

@@ -1,44 +1,34 @@
-from typing import Optional, List, Dict, Tuple
+from typing import List, Dict
 
-from ...lemmatizer import Lemmatizer
-from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
+from ...pipeline import Lemmatizer
+from ...tokens import Token
 
 
 class DutchLemmatizer(Lemmatizer):
-    # Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
-    univ_pos_name_variants = {
-        NOUN: "noun",
-        "NOUN": "noun",
-        "noun": "noun",
-        VERB: "verb",
-        "VERB": "verb",
-        "verb": "verb",
-        AUX: "verb",
-        "AUX": "verb",
-        "aux": "verb",
-        ADJ: "adj",
-        "ADJ": "adj",
-        "adj": "adj",
-        ADV: "adv",
-        "ADV": "adv",
-        "adv": "adv",
-        PRON: "pron",
-        "PRON": "pron",
-        "pron": "pron",
-        DET: "det",
-        "DET": "det",
-        "det": "det",
-        ADP: "adp",
-        "ADP": "adp",
-        "adp": "adp",
-        NUM: "num",
-        "NUM": "num",
-        "num": "num",
-    }
+    @classmethod
+    def get_lookups_config(cls, mode: str) -> Dict:
+        if mode == "rule":
+            return {
+                "required_tables": [
+                    "lemma_lookup",
+                    "lemma_rules",
+                    "lemma_exc",
+                    "lemma_index",
+                ],
+            }
+        else:
+            return super().get_lookups_config(mode)
 
-    def __call__(
-        self, string: str, univ_pos: str, morphology: Optional[dict] = None
-    ) -> List[str]:
+    def lookup_lemmatize(self, token: Token) -> List[str]:
+        """Overrides parent method so that a lowercased version of the string
+        is used to search the lookup table. This is necessary because our
+        lookup table consists entirely of lowercase keys."""
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        string = token.text.lower()
+        return [lookup_table.get(string, string)]
+
+    # Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
+    def rule_lemmatize(self, token: Token) -> List[str]:
         # Difference 1: self.rules is assumed to be non-None, so no
         # 'is None' check required.
         # String lowercased from the get-go. All lemmatization results in
@@ -46,74 +36,61 @@ class DutchLemmatizer(Lemmatizer):
         # any problems, and it keeps the exceptions indexes small. If this
         # creates problems for proper nouns, we can introduce a check for
         # univ_pos == "PROPN".
-        string = string.lower()
-        try:
-            univ_pos = self.univ_pos_name_variants[univ_pos]
-        except KeyError:
-            # Because PROPN not in self.univ_pos_name_variants, proper names
-            # are not lemmatized. They are lowercased, however.
-            return [string]
-        # if string in self.lemma_index.get(univ_pos)
+        cache_key = (token.lower, token.pos)
+        if cache_key in self.cache:
+            return self.cache[cache_key]
+        string = token.text
+        univ_pos = token.pos_.lower()
+        if univ_pos in ("", "eol", "space"):
+            forms = [string.lower()]
+            self.cache[cache_key] = forms
+            return forms
         index_table = self.lookups.get_table("lemma_index", {})
+        exc_table = self.lookups.get_table("lemma_exc", {})
+        rules_table = self.lookups.get_table("lemma_rules", {})
+        index = index_table.get(univ_pos, {})
+        exceptions = exc_table.get(univ_pos, {})
+        rules = rules_table.get(univ_pos, {})
+        string = string.lower()
+        if univ_pos not in (
+            "noun",
+            "verb",
+            "aux",
+            "adj",
+            "adv",
+            "pron",
+            "det",
+            "adp",
+            "num",
+        ):
+            forms = [string]
+            self.cache[cache_key] = forms
+            return forms
         lemma_index = index_table.get(univ_pos, {})
         # string is already lemma
         if string in lemma_index:
-            return [string]
+            forms = [string]
+            self.cache[cache_key] = forms
+            return forms
         exc_table = self.lookups.get_table("lemma_exc", {})
         exceptions = exc_table.get(univ_pos, {})
         # string is irregular token contained in exceptions index.
         try:
-            lemma = exceptions[string]
-            return [lemma[0]]
+            forms = [exceptions[string][0]]
+            self.cache[cache_key] = forms
+            return forms
         except KeyError:
             pass
         # string corresponds to key in lookup table
         lookup_table = self.lookups.get_table("lemma_lookup", {})
         looked_up_lemma = lookup_table.get(string)
         if looked_up_lemma and looked_up_lemma in lemma_index:
-            return [looked_up_lemma]
+            forms = [looked_up_lemma]
+            self.cache[cache_key] = forms
+            return forms
         rules_table = self.lookups.get_table("lemma_rules", {})
-        forms, is_known = self.lemmatize(
-            string, lemma_index, exceptions, rules_table.get(univ_pos, [])
-        )
-        # Back-off through remaining return value candidates.
-        if forms:
-            if is_known:
-                return forms
-            else:
-                for form in forms:
-                    if form in exceptions:
-                        return [form]
-                if looked_up_lemma:
-                    return [looked_up_lemma]
-                else:
-                    return forms
-        elif looked_up_lemma:
-            return [looked_up_lemma]
-        else:
-            return [string]
-
-    # Overrides parent method so that a lowercased version of the string is
-    # used to search the lookup table. This is necessary because our lookup
-    # table consists entirely of lowercase keys.
-    def lookup(self, string: str, orth: Optional[int] = None) -> str:
-        lookup_table = self.lookups.get_table("lemma_lookup", {})
-        string = string.lower()
-        if orth is not None:
-            return lookup_table.get(orth, string)
-        else:
-            return lookup_table.get(string, string)
-
-    # Reimplemented to focus more on application of suffix rules and to return
-    # as early as possible.
-    def lemmatize(
-        self,
-        string: str,
-        index: Dict[str, List[str]],
-        exceptions: Dict[str, Dict[str, List[str]]],
-        rules: Dict[str, List[List[str]]],
-    ) -> Tuple[List[str], bool]:
-        # returns (forms, is_known: bool)
         oov_forms = []
         for old, new in rules:
             if string.endswith(old):
@@ -121,7 +98,31 @@ class DutchLemmatizer(Lemmatizer):
                 if not form:
                     pass
                 elif form in index:
-                    return [form], True  # True = Is known (is lemma)
+                    forms = [form]
+                    self.cache[cache_key] = forms
+                    return forms
                 else:
                     oov_forms.append(form)
-        return list(set(oov_forms)), False
+        forms = list(set(oov_forms))
+        # Back-off through remaining return value candidates.
+        if forms:
+            for form in forms:
+                if form in exceptions:
+                    forms = [form]
+                    self.cache[cache_key] = forms
+                    return forms
+            if looked_up_lemma:
+                forms = [looked_up_lemma]
+                self.cache[cache_key] = forms
+                return forms
+            else:
+                self.cache[cache_key] = forms
+                return forms
+        elif looked_up_lemma:
+            forms = [looked_up_lemma]
+            self.cache[cache_key] = forms
+            return forms
+        else:
+            forms = [string]
+            self.cache[cache_key] = forms
+            return forms

View File

@@ -1,5 +1,6 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+from thinc.api import Model
 
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
@@ -7,42 +8,16 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import PolishLemmatizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...lookups import load_lookups
+from ...lookups import Lookups
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.pl.PolishLemmatizer"
-"""
 
 
 TOKENIZER_EXCEPTIONS = {
     exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
 }
 
-
-@registry.lemmatizers("spacy.pl.PolishLemmatizer")
-def create_lemmatizer() -> Callable[[Language], PolishLemmatizer]:
-    # fmt: off
-    tables = [
-        "lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv",
-        "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num",
-        "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"
-    ]
-    # fmt: on
-
-    def lemmatizer_factory(nlp: Language) -> PolishLemmatizer:
-        lookups = load_lookups(lang=nlp.lang, tables=tables)
-        return PolishLemmatizer(lookups=lookups)
-
-    return lemmatizer_factory
-
 
 class PolishDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
@@ -56,4 +31,22 @@ class Polish(Language):
     Defaults = PolishDefaults
 
 
+@Polish.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "lookup", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Polish"]

View File

@@ -1,7 +1,7 @@
-from typing import Optional, List, Dict
+from typing import List, Dict
 
-from ...lemmatizer import Lemmatizer
-from ...parts_of_speech import NAMES
+from ...pipeline import Lemmatizer
+from ...tokens import Token
 
 
 class PolishLemmatizer(Lemmatizer):
@@ -9,12 +9,30 @@ class PolishLemmatizer(Lemmatizer):
     # dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS.
     # It utilizes some prefix based improvements for verb and adjectives
     # lemmatization, as well as case-sensitive lemmatization for nouns.
-    def __call__(
-        self, string: str, univ_pos: str, morphology: Optional[dict] = None
-    ) -> List[str]:
-        if isinstance(univ_pos, int):
-            univ_pos = NAMES.get(univ_pos, "X")
-        univ_pos = univ_pos.upper()
+
+    @classmethod
+    def get_lookups_config(cls, mode: str) -> Dict:
+        if mode == "lookup":
+            return {
+                "required_tables": [
+                    "lemma_lookup_adj",
+                    "lemma_lookup_adp",
+                    "lemma_lookup_adv",
+                    "lemma_lookup_aux",
+                    "lemma_lookup_noun",
+                    "lemma_lookup_num",
+                    "lemma_lookup_part",
+                    "lemma_lookup_pron",
+                    "lemma_lookup_verb",
+                ]
+            }
+        else:
+            return super().get_lookups_config(mode)
+
+    def lookup_lemmatize(self, token: Token) -> List[str]:
+        string = token.text
+        univ_pos = token.pos_
+        morphology = token.morph.to_dict()
         lookup_pos = univ_pos.lower()
         if univ_pos == "PROPN":
             lookup_pos = "noun"
@@ -71,15 +89,3 @@ class PolishLemmatizer(Lemmatizer):
                 return [lookup_table[string]]
             return [string.lower()]
         return [lookup_table.get(string, string)]
-
-    def lookup(self, string: str, orth: Optional[int] = None) -> str:
-        return string.lower()
-
-    def lemmatize(
-        self,
-        string: str,
-        index: Dict[str, List[str]],
-        exceptions: Dict[str, Dict[str, List[str]]],
-        rules: Dict[str, List[List[str]]],
-    ) -> List[str]:
-        raise NotImplementedError

View File

@@ -1,32 +1,16 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+from thinc.api import Model
 
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
-from ...util import registry
 from ...language import Language
+from ...lookups import Lookups
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.ru.RussianLemmatizer"
-"""
-
-
-@registry.lemmatizers("spacy.ru.RussianLemmatizer")
-def create_lemmatizer() -> Callable[[Language], RussianLemmatizer]:
-    def lemmatizer_factory(nlp: Language) -> RussianLemmatizer:
-        return RussianLemmatizer()
-
-    return lemmatizer_factory
 
 
 class RussianDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
@@ -37,4 +21,21 @@ class Russian(Language):
     Defaults = RussianDefaults
 
 
+@Russian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Russian"]

View File

@@ -1,8 +1,12 @@
-from typing import Optional, Tuple, Dict, List
+from typing import Optional, List, Dict, Tuple
 
-from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
-from ...lemmatizer import Lemmatizer
+from thinc.api import Model
+
 from ...lookups import Lookups
+from ...pipeline import Lemmatizer
+from ...symbols import POS
+from ...tokens import Token
+from ...vocab import Vocab
 
 
 PUNCT_RULES = {"«": '"', "»": '"'}
@@ -11,8 +15,17 @@ PUNCT_RULES = {"«": '"', "»": '"'}
 class RussianLemmatizer(Lemmatizer):
     _morph = None
 
-    def __init__(self, lookups: Optional[Lookups] = None) -> None:
-        super(RussianLemmatizer, self).__init__(lookups)
+    def __init__(
+        self,
+        vocab: Vocab,
+        model: Optional[Model],
+        name: str = "lemmatizer",
+        *,
+        mode: str = "pymorphy2",
+        lookups: Optional[Lookups] = None,
+    ) -> None:
+        super().__init__(vocab, model, name, mode=mode, lookups=lookups)
         try:
             from pymorphy2 import MorphAnalyzer
         except ImportError:
@@ -25,10 +38,10 @@ class RussianLemmatizer(Lemmatizer):
         if RussianLemmatizer._morph is None:
             RussianLemmatizer._morph = MorphAnalyzer()
 
-    def __call__(
-        self, string: str, univ_pos: str, morphology: Optional[dict] = None
-    ) -> List[str]:
-        univ_pos = self.normalize_univ_pos(univ_pos)
+    def pymorphy2_lemmatize(self, token: Token) -> List[str]:
+        string = token.text
+        univ_pos = token.pos_
+        morphology = token.morph.to_dict()
         if univ_pos == "PUNCT":
             return [PUNCT_RULES.get(string, string)]
         if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
@@ -81,25 +94,8 @@ class RussianLemmatizer(Lemmatizer):
             return [string.lower()]
         return list(set([analysis.normal_form for analysis in filtered_analyses]))
 
-    @staticmethod
-    def normalize_univ_pos(univ_pos: str) -> Optional[str]:
-        if isinstance(univ_pos, str):
-            return univ_pos.upper()
-
-        symbols_to_str = {
-            ADJ: "ADJ",
-            DET: "DET",
-            NOUN: "NOUN",
-            NUM: "NUM",
-            PRON: "PRON",
-            PROPN: "PROPN",
-            PUNCT: "PUNCT",
-            VERB: "VERB",
-        }
-        if univ_pos in symbols_to_str:
-            return symbols_to_str[univ_pos]
-        return None
-
-    def lookup(self, string: str, orth: Optional[int] = None) -> str:
+    def lookup_lemmatize(self, token: Token) -> List[str]:
+        string = token.text
         analyses = self._morph.parse(string)
         if len(analyses) == 1:
             return analyses[0].normal_form

View File

@@ -1,32 +1,16 @@
-from typing import Callable
-from thinc.api import Config
+from typing import Optional
+from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ...util import registry
-from ...language import Language
 from .lemmatizer import UkrainianLemmatizer
+from ...language import Language
+from ...lookups import Lookups
-
-
-DEFAULT_CONFIG = """
-[nlp]
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.uk.UkrainianLemmatizer"
-"""
-
-
-@registry.lemmatizers("spacy.uk.UkrainianLemmatizer")
-def create_ukrainian_lemmatizer() -> Callable[[Language], UkrainianLemmatizer]:
-    def lemmatizer_factory(nlp: Language) -> UkrainianLemmatizer:
-        return UkrainianLemmatizer()
-
-    return lemmatizer_factory
 
 
 class UkrainianDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
@@ -37,4 +21,21 @@ class Ukrainian(Language):
     Defaults = UkrainianDefaults
 
 
+@Ukrainian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Ukrainian"]

View File

@@ -1,187 +1,30 @@
-from typing import Optional, List, Tuple, Dict
+from typing import Optional
 
-from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
+from thinc.api import Model
+
+from ..ru.lemmatizer import RussianLemmatizer
 from ...lookups import Lookups
-from ...lemmatizer import Lemmatizer
+from ...vocab import Vocab
 
 
-PUNCT_RULES = {"«": '"', "»": '"'}
-
-
-class UkrainianLemmatizer(Lemmatizer):
-    _morph = None
-
-    def __init__(self, lookups: Optional[Lookups] = None) -> None:
-        super(UkrainianLemmatizer, self).__init__(lookups)
+class UkrainianLemmatizer(RussianLemmatizer):
+    def __init__(
+        self,
+        vocab: Vocab,
+        model: Optional[Model],
+        name: str = "lemmatizer",
+        *,
+        mode: str = "pymorphy2",
+        lookups: Optional[Lookups] = None,
+    ) -> None:
+        super().__init__(vocab, model, name, mode=mode, lookups=lookups)
         try:
             from pymorphy2 import MorphAnalyzer
-
-            if UkrainianLemmatizer._morph is None:
-                UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
-        except (ImportError, TypeError):
+        except ImportError:
             raise ImportError(
                 "The Ukrainian lemmatizer requires the pymorphy2 library and "
                 'dictionaries: try to fix it with "pip uninstall pymorphy2" and'
                 '"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
             ) from None
+        if UkrainianLemmatizer._morph is None:
+            UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
-
-    def __call__(
-        self, string: str, univ_pos: str, morphology: Optional[dict] = None
-    ) -> List[str]:
-        univ_pos = self.normalize_univ_pos(univ_pos)
-        if univ_pos == "PUNCT":
-            return [PUNCT_RULES.get(string, string)]
-        if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
-            # Skip unchangeable pos
-            return [string.lower()]
-        analyses = self._morph.parse(string)
-        filtered_analyses = []
-        for analysis in analyses:
-            if not analysis.is_known:
-                # Skip suggested parse variant for unknown word for pymorphy
-                continue
-            analysis_pos, _ = oc2ud(str(analysis.tag))
-            if analysis_pos == univ_pos or (
-                analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
-            ):
-                filtered_analyses.append(analysis)
-        if not len(filtered_analyses):
-            return [string.lower()]
-        if morphology is None or (len(morphology) == 1 and POS in morphology):
-            return list(set([analysis.normal_form for analysis in filtered_analyses]))
-        if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
-            features_to_compare = ["Case", "Number", "Gender"]
-        elif univ_pos == "NUM":
-            features_to_compare = ["Case", "Gender"]
-        elif univ_pos == "PRON":
-            features_to_compare = ["Case", "Number", "Gender", "Person"]
-        else:  # VERB
-            features_to_compare = [
-                "Aspect",
-                "Gender",
-                "Mood",
-                "Number",
-                "Tense",
-                "VerbForm",
-                "Voice",
-            ]
-        analyses, filtered_analyses = filtered_analyses, []
-        for analysis in analyses:
-            _, analysis_morph = oc2ud(str(analysis.tag))
-            for feature in features_to_compare:
-                if (
-                    feature in morphology
-                    and feature in analysis_morph
-                    and morphology[feature].lower() != analysis_morph[feature].lower()
-                ):
-                    break
-            else:
-                filtered_analyses.append(analysis)
-        if not len(filtered_analyses):
-            return [string.lower()]
-        return list(set([analysis.normal_form for analysis in filtered_analyses]))
-
-    @staticmethod
-    def normalize_univ_pos(univ_pos: str) -> Optional[str]:
-        if isinstance(univ_pos, str):
-            return univ_pos.upper()
-
-        symbols_to_str = {
-            ADJ: "ADJ",
-            DET: "DET",
-            NOUN: "NOUN",
-            NUM: "NUM",
-            PRON: "PRON",
-            PROPN: "PROPN",
-            PUNCT: "PUNCT",
-            VERB: "VERB",
-        }
-        if univ_pos in symbols_to_str:
-            return symbols_to_str[univ_pos]
-        return None
-
-    def lookup(self, string: str, orth: Optional[int] = None) -> str:
-        analyses = self._morph.parse(string)
-        if len(analyses) == 1:
-            return analyses[0].normal_form
-        return string
-
-
-def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
-    gram_map = {
-        "_POS": {
-            "ADJF": "ADJ",
-            "ADJS": "ADJ",
-            "ADVB": "ADV",
-            "Apro": "DET",
-            "COMP": "ADJ",  # Can also be an ADV - unchangeable
-            "CONJ": "CCONJ",  # Can also be a SCONJ - both unchangeable ones
-            "GRND": "VERB",
-            "INFN": "VERB",
-            "INTJ": "INTJ",
-            "NOUN": "NOUN",
-            "NPRO": "PRON",
-            "NUMR": "NUM",
-            "NUMB": "NUM",
-            "PNCT": "PUNCT",
-            "PRCL": "PART",
-            "PREP": "ADP",
-            "PRTF": "VERB",
-            "PRTS": "VERB",
-            "VERB": "VERB",
-        },
-        "Animacy": {"anim": "Anim", "inan": "Inan"},
-        "Aspect": {"impf": "Imp", "perf": "Perf"},
-        "Case": {
-            "ablt": "Ins",
-            "accs": "Acc",
-            "datv": "Dat",
-            "gen1": "Gen",
-            "gen2": "Gen",
-            "gent": "Gen",
-            "loc2": "Loc",
-            "loct": "Loc",
-            "nomn": "Nom",
-            "voct": "Voc",
-        },
-        "Degree": {"COMP": "Cmp", "Supr": "Sup"},
-        "Gender": {"femn": "Fem", "masc": "Masc", "neut": "Neut"},
-        "Mood": {"impr": "Imp", "indc": "Ind"},
-        "Number": {"plur": "Plur", "sing": "Sing"},
-        "NumForm": {"NUMB": "Digit"},
-        "Person": {"1per": "1", "2per": "2", "3per": "3", "excl": "2", "incl": "1"},
-        "Tense": {"futr": "Fut", "past": "Past", "pres": "Pres"},
-        "Variant": {"ADJS": "Brev", "PRTS": "Brev"},
-        "VerbForm": {
-            "GRND": "Conv",
-            "INFN": "Inf",
-            "PRTF": "Part",
-            "PRTS": "Part",
-            "VERB": "Fin",
-        },
-        "Voice": {"actv": "Act", "pssv": "Pass"},
-        "Abbr": {"Abbr": "Yes"},
-    }
-    pos = "X"
-    morphology = dict()
-    unmatched = set()
-    grams = oc_tag.replace(" ", ",").split(",")
-    for gram in grams:
-        match = False
-        for categ, gmap in sorted(gram_map.items()):
-            if gram in gmap:
-                match = True
-                if categ == "_POS":
-                    pos = gmap[gram]
-                else:
-                    morphology[categ] = gmap[gram]
-        if not match:
-            unmatched.add(gram)
-    while len(unmatched) > 0:
-        gram = unmatched.pop()
-        if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
-            pos = "PROPN"
-        elif gram == "Auxt":
-            pos = "AUX"
-        elif gram == "Pltm":
-            morphology["Number"] = "Ptan"
-    return pos, morphology

View File

@@ -29,7 +29,6 @@ from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
 from .lookups import load_lookups
 from .tokenizer import Tokenizer
-from .lemmatizer import Lemmatizer
 from .errors import Errors, Warnings
 from .schemas import ConfigSchema
 from .git_info import GIT_VERSION
@@ -87,22 +86,6 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory
 
 
-@registry.lemmatizers("spacy.Lemmatizer.v1")
-def create_lemmatizer() -> Callable[["Language"], "Lemmatizer"]:
-    """Registered function to create a lemmatizer. Returns a factory that takes
-    the nlp object and returns a Lemmatizer instance with data loaded in from
-    spacy-lookups-data, if the package is installed.
-    """
-    # TODO: Will be replaced when the lemmatizer becomes a pipeline component
-    tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
-
-    def lemmatizer_factory(nlp: "Language") -> "Lemmatizer":
-        lookups = load_lookups(lang=nlp.lang, tables=tables, strict=False)
-        return Lemmatizer(lookups=lookups)
-
-    return lemmatizer_factory
-
-
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.
@@ -128,7 +111,6 @@ class Language:
         max_length: int = 10 ** 6,
         meta: Dict[str, Any] = {},
         create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
-        create_lemmatizer: Optional[Callable[["Language"], Callable]] = None,
         **kwargs,
     ) -> None:
         """Initialise a Language object.
@@ -146,8 +128,6 @@ class Language:
             100,000 characters in one text.
         create_tokenizer (Callable): Function that takes the nlp object and
             returns a tokenizer.
-        create_lemmatizer (Callable): Function that takes the nlp object and
-            returns a lemmatizer.
 
         DOCS: https://spacy.io/api/language#init
         """
@@ -166,13 +146,9 @@ class Language:
         if vocab is True:
             vectors_name = meta.get("vectors", {}).get("name")
-            if not create_lemmatizer:
-                lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]}
-                create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
             vocab = create_vocab(
                 self.lang,
                 self.Defaults,
-                lemmatizer=create_lemmatizer(self),
                 vectors_name=vectors_name,
                 load_data=self._config["nlp"]["load_vocab_data"],
             )
@@ -1451,7 +1427,6 @@ class Language:
         filled["components"] = orig_pipeline
         config["components"] = orig_pipeline
         create_tokenizer = resolved["nlp"]["tokenizer"]
-        create_lemmatizer = resolved["nlp"]["lemmatizer"]
         before_creation = resolved["nlp"]["before_creation"]
         after_creation = resolved["nlp"]["after_creation"]
         after_pipeline_creation = resolved["nlp"]["after_pipeline_creation"]
@@ -1467,7 +1442,6 @@ class Language:
         nlp = lang_cls(
             vocab=vocab,
             create_tokenizer=create_tokenizer,
-            create_lemmatizer=create_lemmatizer,
         )
         if after_creation is not None:
             nlp = after_creation(nlp)

View File

@@ -1,145 +0,0 @@
-from typing import Optional, Callable, List, Dict
-
-from .lookups import Lookups
-from .parts_of_speech import NAMES as UPOS_NAMES
-
-
-class Lemmatizer:
-    """
-    The Lemmatizer supports simple part-of-speech-sensitive suffix rules and
-    lookup tables.
-
-    DOCS: https://spacy.io/api/lemmatizer
-    """
-
-    def __init__(
-        self,
-        lookups: Optional[Lookups] = None,
-        is_base_form: Optional[Callable] = None,
-    ) -> None:
-        """Initialize a Lemmatizer.
-
-        lookups (Lookups): The lookups object containing the (optional) tables
-            "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
-        """
-        self.lookups = lookups if lookups is not None else Lookups()
-        self.is_base_form = is_base_form
-
-    def __call__(
-        self, string: str, univ_pos: str, morphology: Optional[dict] = None
-    ) -> List[str]:
-        """Lemmatize a string.
-
-        string (str): The string to lemmatize, e.g. the token text.
-        univ_pos (str / int): The token's universal part-of-speech tag.
-        morphology (dict): The token's morphological features following the
-            Universal Dependencies scheme.
-        RETURNS (list): The available lemmas for the string.
-        """
-        lookup_table = self.lookups.get_table("lemma_lookup", {})
-        if "lemma_rules" not in self.lookups:
-            return [lookup_table.get(string, string)]
-        if isinstance(univ_pos, int):
-            univ_pos = UPOS_NAMES.get(univ_pos, "X")
-        univ_pos = univ_pos.lower()
-        if univ_pos in ("", "eol", "space"):
-            return [string.lower()]
-        # See Issue #435 for example of where this logic is requied.
-        if callable(self.is_base_form) and self.is_base_form(univ_pos, morphology):
-            return [string.lower()]
-        index_table = self.lookups.get_table("lemma_index", {})
-        exc_table = self.lookups.get_table("lemma_exc", {})
-        rules_table = self.lookups.get_table("lemma_rules", {})
-        if not any(
-            (
-                index_table.get(univ_pos),
-                exc_table.get(univ_pos),
-                rules_table.get(univ_pos),
-            )
-        ):
-            if univ_pos == "propn":
-                return [string]
-            else:
-                return [string.lower()]
-        lemmas = self.lemmatize(
-            string,
-            index_table.get(univ_pos, {}),
-            exc_table.get(univ_pos, {}),
-            rules_table.get(univ_pos, []),
-        )
-        return lemmas
-
-    def noun(self, string: str, morphology: Optional[dict] = None) -> List[str]:
-        return self(string, "noun", morphology)
-
-    def verb(self, string: str, morphology: Optional[dict] = None) -> List[str]:
-        return self(string, "verb", morphology)
-
-    def adj(self, string: str, morphology: Optional[dict] = None) -> List[str]:
-        return self(string, "adj", morphology)
-
-    def det(self, string: str, morphology: Optional[dict] = None) -> List[str]:
-        return self(string, "det", morphology)
-
-    def pron(self, string: str, morphology: Optional[dict] = None) -> List[str]:
-        return self(string, "pron", morphology)
-
-    def adp(self, string: str, morphology: Optional[dict] = None) -> List[str]:
-        return self(string, "adp", morphology)
-
-    def num(self, string: str, morphology: Optional[dict] = None) -> List[str]:
-        return self(string, "num", morphology)
-
-    def punct(self, string: str, morphology: Optional[dict] = None) -> List[str]:
-        return self(string, "punct", morphology)
-
-    def lookup(self, string: str, orth: Optional[int] = None) -> str:
-        """Look up a lemma in the table, if available. If no lemma is found,
-        the original string is returned.
-
-        string (str): The original string.
-        orth (int): Optional hash of the string to look up. If not set, the
-            string will be used and hashed.
-        RETURNS (str): The lemma if the string was found, otherwise the
-            original string.
-        """
-        lookup_table = self.lookups.get_table("lemma_lookup", {})
-        key = orth if orth is not None else string
-        if key in lookup_table:
-            return lookup_table[key]
-        return string
-
-    def lemmatize(
-        self,
-        string: str,
-        index: Dict[str, List[str]],
-        exceptions: Dict[str, Dict[str, List[str]]],
-        rules: Dict[str, List[List[str]]],
-    ) -> List[str]:
-        orig = string
-        string = string.lower()
-        forms = []
-        oov_forms = []
-        for old, new in rules:
-            if string.endswith(old):
-                form = string[: len(string) - len(old)] + new
-                if not form:
-                    pass
-                elif form in index or not form.isalpha():
-                    forms.append(form)
-                else:
-                    oov_forms.append(form)
-        # Remove duplicates but preserve the ordering of applied "rules"
-        forms = list(dict.fromkeys(forms))
-        # Put exceptions at the front of the list, so they get priority.
-        # This is a dodgy heuristic -- but it's the best we can do until we get
-        # frequencies on this. We can at least prune out problematic exceptions,
-        # if they shadow more frequent analyses.
-        for form in exceptions.get(string, []):
-            if form not in forms:
-                forms.insert(0, form)
-        if not forms:
-            forms.extend(oov_forms)
-        if not forms:
-            forms.append(orig)
-        return forms

View File

@@ -28,6 +28,8 @@ def load_lookups(
    # TODO: import spacy_lookups_data instead of going via entry points here?
    lookups = Lookups()
    if lang not in registry.lookups:
        if strict and len(tables) > 0:
            raise ValueError(Errors.E955.format(table=", ".join(tables), lang=lang))
        return lookups
    data = registry.lookups.get(lang)
    for table in tables:
@@ -41,152 +43,6 @@ def load_lookups(
    return lookups
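
The new strict flag in context; a minimal sketch, assuming strict defaults to True (the new Lemmatizer component later in this commit relies on that for required tables and passes strict=False only for optional ones) and assuming "xx" has no entry in registry.lookups:

    from spacy.lookups import load_lookups

    # No tables requested: an unregistered language still yields empty Lookups
    lookups = load_lookups(lang="xx", tables=[])
    assert len(lookups) == 0
    # Requesting tables for an unregistered language raises E955 under strict
    # load_lookups(lang="xx", tables=["lemma_lookup"])  # ValueError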
class Lookups:
"""Container for large lookup tables and dictionaries, e.g. lemmatization
data or tokenizer exception lists. Lookups are available via vocab.lookups,
so they can be accessed before the pipeline components are applied (e.g.
in the tokenizer and lemmatizer), as well as within the pipeline components
via doc.vocab.lookups.
"""
def __init__(self) -> None:
"""Initialize the Lookups object.
DOCS: https://spacy.io/api/lookups#init
"""
self._tables = {}
def __contains__(self, name: str) -> bool:
"""Check if the lookups contain a table of a given name. Delegates to
Lookups.has_table.
name (str): Name of the table.
RETURNS (bool): Whether a table of that name is in the lookups.
"""
return self.has_table(name)
def __len__(self) -> int:
"""RETURNS (int): The number of tables in the lookups."""
return len(self._tables)
@property
def tables(self) -> List[str]:
"""RETURNS (List[str]): Names of all tables in the lookups."""
return list(self._tables.keys())
def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> "Table":
"""Add a new table to the lookups. Raises an error if the table exists.
name (str): Unique name of table.
data (dict): Optional data to add to the table.
RETURNS (Table): The newly added table.
DOCS: https://spacy.io/api/lookups#add_table
"""
if name in self.tables:
raise ValueError(Errors.E158.format(name=name))
table = Table(name=name, data=data)
self._tables[name] = table
return table
def get_table(self, name: str, default: Any = UNSET) -> "Table":
"""Get a table. Raises an error if the table doesn't exist and no
default value is provided.
name (str): Name of the table.
default (Any): Optional default value to return if table doesn't exist.
RETURNS (Table): The table.
DOCS: https://spacy.io/api/lookups#get_table
"""
if name not in self._tables:
if default == UNSET:
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
return default
return self._tables[name]
def remove_table(self, name: str) -> "Table":
"""Remove a table. Raises an error if the table doesn't exist.
name (str): Name of the table to remove.
RETURNS (Table): The removed table.
DOCS: https://spacy.io/api/lookups#remove_table
"""
if name not in self._tables:
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
return self._tables.pop(name)
def has_table(self, name: str) -> bool:
"""Check if the lookups contain a table of a given name.
name (str): Name of the table.
RETURNS (bool): Whether a table of that name exists.
DOCS: https://spacy.io/api/lookups#has_table
"""
return name in self._tables
def to_bytes(self, **kwargs) -> bytes:
"""Serialize the lookups to a bytestring.
RETURNS (bytes): The serialized Lookups.
DOCS: https://spacy.io/api/lookups#to_bytes
"""
return srsly.msgpack_dumps(self._tables)
def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
"""Load the lookups from a bytestring.
bytes_data (bytes): The data to load.
RETURNS (Lookups): The loaded Lookups.
DOCS: https://spacy.io/api/lookups#from_bytes
"""
self._tables = {}
for key, value in srsly.msgpack_loads(bytes_data).items():
self._tables[key] = Table(key, value)
return self
def to_disk(
self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
) -> None:
"""Save the lookups to a directory as lookups.bin. Expects a path to a
directory, which will be created if it doesn't exist.
path (str / Path): The file path.
DOCS: https://spacy.io/api/lookups#to_disk
"""
if len(self._tables):
path = ensure_path(path)
if not path.exists():
path.mkdir()
filepath = path / filename
with filepath.open("wb") as file_:
file_.write(self.to_bytes())
def from_disk(
self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
) -> "Lookups":
"""Load lookups from a directory containing a lookups.bin. Will skip
loading if the file doesn't exist.
path (str / Path): The directory path.
RETURNS (Lookups): The loaded lookups.
DOCS: https://spacy.io/api/lookups#from_disk
"""
path = ensure_path(path)
filepath = path / filename
if filepath.exists():
with filepath.open("rb") as file_:
data = file_.read()
return self.from_bytes(data)
return self
class Table(OrderedDict):
    """A table in the lookups. Subclass of builtin dict that implements a
    slightly more consistent and unified API.
@@ -303,3 +159,159 @@ class Table(OrderedDict):
        self.clear()
        self.update(data)
        return self
class Lookups:
"""Container for large lookup tables and dictionaries, e.g. lemmatization
data or tokenizer exception lists. Lookups are available via vocab.lookups,
so they can be accessed before the pipeline components are applied (e.g.
in the tokenizer and lemmatizer), as well as within the pipeline components
via doc.vocab.lookups.
"""
def __init__(self) -> None:
"""Initialize the Lookups object.
DOCS: https://spacy.io/api/lookups#init
"""
self._tables = {}
def __contains__(self, name: str) -> bool:
"""Check if the lookups contain a table of a given name. Delegates to
Lookups.has_table.
name (str): Name of the table.
RETURNS (bool): Whether a table of that name is in the lookups.
"""
return self.has_table(name)
def __len__(self) -> int:
"""RETURNS (int): The number of tables in the lookups."""
return len(self._tables)
@property
def tables(self) -> List[str]:
"""RETURNS (List[str]): Names of all tables in the lookups."""
return list(self._tables.keys())
def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> Table:
"""Add a new table to the lookups. Raises an error if the table exists.
name (str): Unique name of table.
data (dict): Optional data to add to the table.
RETURNS (Table): The newly added table.
DOCS: https://spacy.io/api/lookups#add_table
"""
if name in self.tables:
raise ValueError(Errors.E158.format(name=name))
table = Table(name=name, data=data)
self._tables[name] = table
return table
def set_table(self, name: str, table: Table) -> None:
"""Set a table.
name (str): Name of the table to set.
table (Table): The Table to set.
DOCS: https://spacy.io/api/lookups#set_table
"""
self._tables[name] = table
def get_table(self, name: str, default: Any = UNSET) -> Table:
"""Get a table. Raises an error if the table doesn't exist and no
default value is provided.
name (str): Name of the table.
default (Any): Optional default value to return if table doesn't exist.
RETURNS (Table): The table.
DOCS: https://spacy.io/api/lookups#get_table
"""
if name not in self._tables:
if default == UNSET:
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
return default
return self._tables[name]
def remove_table(self, name: str) -> Table:
"""Remove a table. Raises an error if the table doesn't exist.
name (str): Name of the table to remove.
RETURNS (Table): The removed table.
DOCS: https://spacy.io/api/lookups#remove_table
"""
if name not in self._tables:
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
return self._tables.pop(name)
def has_table(self, name: str) -> bool:
"""Check if the lookups contain a table of a given name.
name (str): Name of the table.
RETURNS (bool): Whether a table of that name exists.
DOCS: https://spacy.io/api/lookups#has_table
"""
return name in self._tables
def to_bytes(self, **kwargs) -> bytes:
"""Serialize the lookups to a bytestring.
RETURNS (bytes): The serialized Lookups.
DOCS: https://spacy.io/api/lookups#to_bytes
"""
return srsly.msgpack_dumps(self._tables)
def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
"""Load the lookups from a bytestring.
bytes_data (bytes): The data to load.
RETURNS (Lookups): The loaded Lookups.
DOCS: https://spacy.io/api/lookups#from_bytes
"""
self._tables = {}
for key, value in srsly.msgpack_loads(bytes_data).items():
self._tables[key] = Table(key, value)
return self
def to_disk(
self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
) -> None:
"""Save the lookups to a directory as lookups.bin. Expects a path to a
directory, which will be created if it doesn't exist.
path (str / Path): The file path.
DOCS: https://spacy.io/api/lookups#to_disk
"""
if len(self._tables):
path = ensure_path(path)
if not path.exists():
path.mkdir()
filepath = path / filename
with filepath.open("wb") as file_:
file_.write(self.to_bytes())
def from_disk(
self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
) -> "Lookups":
"""Load lookups from a directory containing a lookups.bin. Will skip
loading if the file doesn't exist.
path (str / Path): The directory path.
RETURNS (Lookups): The loaded lookups.
DOCS: https://spacy.io/api/lookups#from_disk
"""
path = ensure_path(path)
filepath = path / filename
if filepath.exists():
with filepath.open("rb") as file_:
data = file_.read()
return self.from_bytes(data)
return self
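
A quick usage sketch of the reordered Lookups API, including the new set_table; the table names and contents here are illustrative:

    from spacy.lookups import Lookups, Table

    lookups = Lookups()
    lookups.add_table("lemma_lookup", {"dogs": "dog"})
    assert "lemma_lookup" in lookups
    # set_table accepts a prebuilt Table, as the lemmatizer below uses it
    lookups.set_table("lemma_exc", Table(name="lemma_exc", data={"verb": {"coping": ("cope",)}}))
    reloaded = Lookups().from_bytes(lookups.to_bytes())
    assert reloaded.get_table("lemma_lookup")["dogs"] == "dog"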


@@ -27,12 +27,6 @@ cdef class Morphology:
    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
    cdef int insert(self, MorphAnalysisC tag) except -1
    cdef int assign_untagged(self, TokenC* token) except -1
    cdef int assign_tag(self, TokenC* token, tag) except -1
    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
    cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1

cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
cdef list list_features(const MorphAnalysisC* morph)


@@ -31,43 +31,15 @@ cdef class Morphology:
    VALUE_SEP = ","
    EMPTY_MORPH = "_"  # not an empty string so that the PreshMap key is not 0

    def __init__(self, StringStore strings, tag_map, lemmatizer, exc=None):
    def __init__(self, StringStore strings):
        self.mem = Pool()
        self.strings = strings
        self.tags = PreshMap()
        self.load_tag_map(tag_map)
        self.lemmatizer = lemmatizer
        self._cache = PreshMapArray(self.n_tags)
        self._exc = {}
        if exc is not None:
            self.load_morph_exceptions(exc)

    def load_tag_map(self, tag_map):
        self.tag_map = {}
        self.reverse_index = {}
        # Add special space symbol. We prefix with underscore, to make sure it
        # always sorts to the end.
        if '_SP' in tag_map:
            space_attrs = tag_map.get('_SP')
        else:
            space_attrs = tag_map.get('SP', {POS: SPACE})
        if '_SP' not in tag_map:
            self.strings.add('_SP')
            tag_map = dict(tag_map)
            tag_map['_SP'] = space_attrs
        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
            attrs = self.normalize_attrs(attrs)
            self.add(attrs)
            self.tag_map[tag_str] = dict(attrs)
            self.reverse_index[self.strings.add(tag_str)] = i
        self.tag_names = tuple(sorted(self.tag_map.keys()))
        self.n_tags = len(self.tag_map)
        self._cache = PreshMapArray(self.n_tags)

    def __reduce__(self):
        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
                self.exc), None, None)
        tags = set([self.get(self.strings[s]) for s in self.strings])
        tags -= set([""])
        return (unpickle_morphology, (self.strings, sorted(tags)), None, None)

    def add(self, features):
        """Insert a morphological analysis in the morphology table, if not
@@ -185,115 +157,6 @@ cdef class Morphology:
        else:
            return self.strings[tag.key]
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
if orth not in self.strings:
return orth
cdef unicode py_string = self.strings[orth]
if self.lemmatizer is None:
return self.strings.add(py_string.lower())
cdef list lemma_strings
cdef unicode lemma_string
# Normalize features into a dict keyed by the field, to make life easier
# for the lemmatizer. Handles string-to-int conversion too.
string_feats = {}
for key, value in morphology.items():
if value is True:
name, value = self.strings.as_string(key).split('_', 1)
string_feats[name] = value
else:
string_feats[self.strings.as_string(key)] = self.strings.as_string(value)
lemma_strings = self.lemmatizer(py_string, univ_pos, string_feats)
lemma_string = lemma_strings[0]
lemma = self.strings.add(lemma_string)
return lemma
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
force=False):
"""Add a special-case rule to the morphological analyser. Tokens whose
tag and orth match the rule will receive the specified properties.
tag (str): The part-of-speech tag to key the exception.
orth (str): The word-form to key the exception.
"""
attrs = dict(attrs)
attrs = self.normalize_attrs(attrs)
self.add(attrs)
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self._exc[(tag_str, self.strings.add(orth_str))] = attrs
cdef int assign_untagged(self, TokenC* token) except -1:
"""Set morphological attributes on a token without a POS tag. Uses
the lemmatizer's lookup() method, which looks up the string in the
table provided by the language data as lemma_lookup (if available).
"""
if token.lemma == 0:
orth_str = self.strings[token.lex.orth]
lemma = self.lemmatizer.lookup(orth_str, orth=token.lex.orth)
token.lemma = self.strings.add(lemma)
cdef int assign_tag(self, TokenC* token, tag_str) except -1:
cdef attr_t tag = self.strings.as_int(tag_str)
if tag in self.reverse_index:
tag_id = self.reverse_index[tag]
self.assign_tag_id(token, tag_id)
else:
token.tag = tag
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
if tag_id > self.n_tags:
raise ValueError(Errors.E014.format(tag=tag_id))
# Ensure spaces get tagged as space.
# It seems pretty arbitrary to put this logic here, but there's really
# nowhere better. I guess the justification is that this is where the
# specific word and the tag interact. Still, we should have a better
# way to enforce this rule, or figure out why the statistical model fails.
# Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings.add('_SP')]
tag_str = self.tag_names[tag_id]
features = dict(self.tag_map.get(tag_str, {}))
if features:
pos = self.strings.as_int(features.pop(POS))
else:
pos = 0
cdef attr_t lemma = <attr_t>self._cache.get(tag_id, token.lex.orth)
if lemma == 0:
# Ugh, self.lemmatize has opposite arg order from self.lemmatizer :(
lemma = self.lemmatize(pos, token.lex.orth, features)
self._cache.set(tag_id, token.lex.orth, <void*>lemma)
token.lemma = lemma
token.pos = <univ_pos_t>pos
token.tag = self.strings[tag_str]
token.morph = self.add(features)
if (self.tag_names[tag_id], token.lex.orth) in self._exc:
self._assign_tag_from_exceptions(token, tag_id)
cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1:
key = (self.tag_names[tag_id], token.lex.orth)
cdef dict attrs
attrs = self._exc[key]
token.pos = attrs.get(POS, token.pos)
token.lemma = attrs.get(LEMMA, token.lemma)
def load_morph_exceptions(self, dict morph_rules):
self._exc = {}
# Map (form, pos) to attributes
for tag, exc in morph_rules.items():
for orth, attrs in exc.items():
attrs = self.normalize_attrs(attrs)
self.add_special_case(self.strings.as_string(tag), self.strings.as_string(orth), attrs)
@property
def exc(self):
# generate the serializable exc in the MORPH_RULES format from the
# internal tuple-key format
morph_rules = {}
for (tag, orth) in sorted(self._exc):
if not tag in morph_rules:
morph_rules[tag] = {}
morph_rules[tag][self.strings[orth]] = self._exc[(tag, orth)]
return morph_rules
    @staticmethod
    def feats_to_dict(feats):
        if not feats or feats == Morphology.EMPTY_MORPH:
@@ -338,3 +201,9 @@ cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field):
            results[n_results] = morph.features[i]
            n_results += 1
    return n_results
def unpickle_morphology(strings, tags):
cdef Morphology morphology = Morphology(strings)
for tag in tags:
morphology.add(tag)
return morphology
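
The slimmed-down Morphology now just interns FEATS strings; a minimal sketch mirroring the updated pickle test further down:

    import pickle
    from spacy.morphology import Morphology
    from spacy.strings import StringStore

    morphology = Morphology(StringStore())
    morphology.add("Feat1=Val1|Feat2=Val2")
    key = morphology.strings["Feat1=Val1|Feat2=Val2"]
    assert morphology.get(key) == "Feat1=Val1|Feat2=Val2"
    # pickling round-trips via unpickle_morphology, re-adding the known tags
    reloaded = pickle.loads(pickle.dumps(morphology))
    assert reloaded.get(key) == "Feat1=Val1|Feat2=Val2"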


@@ -3,9 +3,10 @@ from .dep_parser import DependencyParser
from .entity_linker import EntityLinker
from .ner import EntityRecognizer
from .entityruler import EntityRuler
from .lemmatizer import Lemmatizer
from .morphologizer import Morphologizer
from .pipe import Pipe
from spacy.pipeline.senter import SentenceRecognizer
from .senter import SentenceRecognizer
from .sentencizer import Sentencizer
from .simple_ner import SimpleNER
from .tagger import Tagger
@@ -20,6 +21,7 @@ __all__ = [
    "EntityRecognizer",
    "EntityRuler",
    "Morphologizer",
    "Lemmatizer",
    "Pipe",
    "SentenceRecognizer",
    "Sentencizer",


@@ -0,0 +1,330 @@
from typing import Optional, List, Dict, Any
from thinc.api import Model
from .pipe import Pipe
from ..errors import Errors
from ..language import Language
from ..lookups import Lookups, load_lookups
from ..scorer import Scorer
from ..tokens import Doc, Token
from ..vocab import Vocab
from .. import util
@Language.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={
"model": None,
"mode": "lookup",
"lookups": None,
"overwrite": False,
},
scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
overwrite: bool = False,
):
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
return Lemmatizer(
nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
)
class Lemmatizer(Pipe):
"""
The Lemmatizer supports simple part-of-speech-sensitive suffix rules and
lookup tables.
DOCS: https://spacy.io/api/lemmatizer
"""
@classmethod
def get_lookups_config(cls, mode: str) -> Dict:
"""Returns the lookups configuration settings for a given mode for use
in Lemmatizer.load_lookups.
mode (str): The lemmatizer mode.
RETURNS (dict): The lookups configuration settings for this mode.
DOCS: https://spacy.io/api/lemmatizer#get_lookups_config
"""
if mode == "lookup":
return {
"required_tables": ["lemma_lookup"],
}
elif mode == "rule":
return {
"required_tables": ["lemma_rules"],
"optional_tables": ["lemma_exc", "lemma_index"],
}
return {}
@classmethod
    def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups]) -> Lookups:
"""Load and validate lookups tables. If the provided lookups is None,
load the default lookups tables according to the language and mode
settings. Confirm that all required tables for the language and mode
are present.
lang (str): The language code.
mode (str): The lemmatizer mode.
lookups (Lookups): The provided lookups, may be None if the default
lookups should be loaded.
RETURNS (Lookups): The Lookups object.
        DOCS: https://spacy.io/api/lemmatizer#load_lookups
"""
config = cls.get_lookups_config(mode)
required_tables = config.get("required_tables", [])
optional_tables = config.get("optional_tables", [])
if lookups is None:
lookups = load_lookups(lang=lang, tables=required_tables)
optional_lookups = load_lookups(
lang=lang, tables=optional_tables, strict=False
)
for table in optional_lookups.tables:
lookups.set_table(table, optional_lookups.get_table(table))
for table in required_tables:
if table not in lookups:
raise ValueError(
Errors.E1004.format(
mode=mode, tables=required_tables, found=lookups.tables
)
)
return lookups
def __init__(
self,
vocab: Vocab,
model: Optional[Model],
name: str = "lemmatizer",
*,
mode: str = "lookup",
lookups: Optional[Lookups] = None,
overwrite: bool = False,
) -> None:
"""Initialize a Lemmatizer.
vocab (Vocab): The vocab.
model (Model): A model (not yet implemented).
name (str): The component name. Defaults to "lemmatizer".
mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
lookups (Lookups): The lookups object containing the (optional) tables
such as "lemma_rules", "lemma_index", "lemma_exc" and
"lemma_lookup". Defaults to None
overwrite (bool): Whether to overwrite existing lemmas. Defaults to
`False`.
DOCS: https://spacy.io/api/lemmatizer#init
"""
self.vocab = vocab
self.model = model
self._mode = mode
self.lookups = lookups if lookups is not None else Lookups()
self.overwrite = overwrite
if self.mode == "lookup":
self.lemmatize = self.lookup_lemmatize
elif self.mode == "rule":
self.lemmatize = self.rule_lemmatize
else:
try:
self.lemmatize = getattr(self, f"{self.mode}_lemmatize")
except AttributeError:
raise ValueError(Errors.E1003.format(mode=mode))
self.cache = {}
@property
def mode(self):
return self._mode
def __call__(self, doc: Doc) -> Doc:
"""Apply the lemmatizer to one document.
doc (Doc): The Doc to process.
RETURNS (Doc): The processed Doc.
DOCS: https://spacy.io/api/lemmatizer#call
"""
for token in doc:
if self.overwrite or token.lemma == 0:
token.lemma_ = self.lemmatize(token)[0]
return doc
def pipe(self, stream, *, batch_size=128):
"""Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are
applied to the Doc.
stream (Iterable[Doc]): A stream of documents.
batch_size (int): The number of documents to buffer.
YIELDS (Doc): Processed documents in order.
DOCS: https://spacy.io/api/lemmatizer#pipe
"""
for doc in stream:
doc = self(doc)
yield doc
def lookup_lemmatize(self, token: Token) -> List[str]:
"""Lemmatize using a lookup-based approach.
token (Token): The token to lemmatize.
RETURNS (list): The available lemmas for the string.
DOCS: https://spacy.io/api/lemmatizer#lookup_lemmatize
"""
lookup_table = self.lookups.get_table("lemma_lookup", {})
result = lookup_table.get(token.text, token.text)
if isinstance(result, str):
result = [result]
return result
def rule_lemmatize(self, token: Token) -> List[str]:
"""Lemmatize using a rule-based approach.
token (Token): The token to lemmatize.
RETURNS (list): The available lemmas for the string.
DOCS: https://spacy.io/api/lemmatizer#rule_lemmatize
"""
cache_key = (token.orth, token.pos, token.morph)
if cache_key in self.cache:
return self.cache[cache_key]
string = token.text
univ_pos = token.pos_.lower()
if univ_pos in ("", "eol", "space"):
return [string.lower()]
        # See Issue #435 for an example of where this logic is required.
if self.is_base_form(token):
return [string.lower()]
index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {})
if not any(
(
index_table.get(univ_pos),
exc_table.get(univ_pos),
rules_table.get(univ_pos),
)
):
if univ_pos == "propn":
return [string]
else:
return [string.lower()]
index = index_table.get(univ_pos, {})
exceptions = exc_table.get(univ_pos, {})
rules = rules_table.get(univ_pos, {})
orig = string
string = string.lower()
forms = []
oov_forms = []
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index or not form.isalpha():
forms.append(form)
else:
oov_forms.append(form)
# Remove duplicates but preserve the ordering of applied "rules"
forms = list(dict.fromkeys(forms))
# Put exceptions at the front of the list, so they get priority.
# This is a dodgy heuristic -- but it's the best we can do until we get
# frequencies on this. We can at least prune out problematic exceptions,
# if they shadow more frequent analyses.
for form in exceptions.get(string, []):
if form not in forms:
forms.insert(0, form)
if not forms:
forms.extend(oov_forms)
if not forms:
forms.append(orig)
self.cache[cache_key] = forms
return forms
def is_base_form(self, token: Token) -> bool:
"""Check whether the token is a base form that does not need further
analysis for lemmatization.
token (Token): The token.
RETURNS (bool): Whether the token is a base form.
DOCS: https://spacy.io/api/lemmatizer#is_base_form
"""
return False
def score(self, examples, **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores.
DOCS: https://spacy.io/api/lemmatizer#score
"""
return Scorer.score_token_attr(examples, "lemma", **kwargs)
def to_disk(self, path, *, exclude=tuple()):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist.
exclude (list): String names of serialization fields to exclude.
        DOCS: https://spacy.io/api/lemmatizer#to_disk
"""
serialize = {}
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
serialize["lookups"] = lambda p: self.lookups.to_disk(p)
util.to_disk(path, serialize, exclude)
def from_disk(self, path, *, exclude=tuple()):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode or Path): A path to a directory.
exclude (list): String names of serialization fields to exclude.
        RETURNS (Lemmatizer): The modified `Lemmatizer` object.
        DOCS: https://spacy.io/api/lemmatizer#from_disk
"""
deserialize = {}
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
util.from_disk(path, deserialize, exclude)
def to_bytes(self, *, exclude=tuple()) -> bytes:
"""Serialize the current state to a binary string.
exclude (list): String names of serialization fields to exclude.
        RETURNS (bytes): The serialized form of the `Lemmatizer` object.
        DOCS: https://spacy.io/api/lemmatizer#to_bytes
"""
serialize = {}
serialize["vocab"] = self.vocab.to_bytes
serialize["lookups"] = self.lookups.to_bytes
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data: bytes, *, exclude=tuple()):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
exclude (list): String names of serialization fields to exclude.
        RETURNS (Lemmatizer): The `Lemmatizer` object.
        DOCS: https://spacy.io/api/lemmatizer#from_bytes
"""
deserialize = {}
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
util.from_bytes(bytes_data, deserialize, exclude)
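
End-to-end, the factory above is wired up the way the tests below do it: register a Lookups asset and reference it from the component config. The asset name and table contents here are illustrative:

    from spacy import registry
    from spacy.lang.en import English
    from spacy.lookups import Lookups

    @registry.assets("example_cope_lookups")
    def example_cope_lookups():
        lookups = Lookups()
        lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
        lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
        lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
        return lookups

    nlp = English()
    lemmatizer = nlp.add_pipe(
        "lemmatizer",
        config={"mode": "rule", "lookups": {"@assets": "example_cope_lookups"}},
    )
    doc = nlp.make_doc("coping")
    doc[0].pos_ = "VERB"   # rule mode keys its tables off Token.pos
    doc = lemmatizer(doc)
    assert doc[0].lemma_ == "cope"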


@@ -39,12 +39,12 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
    "tagger",
    assigns=["token.tag"],
    default_config={"model": DEFAULT_TAGGER_MODEL, "set_morphology": False},
    default_config={"model": DEFAULT_TAGGER_MODEL},
    scores=["tag_acc", "pos_acc", "lemma_acc"],
    scores=["tag_acc"],
    default_score_weights={"tag_acc": 1.0},
)
def make_tagger(nlp: Language, name: str, model: Model, set_morphology: bool):
    return Tagger(nlp.vocab, model, name, set_morphology=set_morphology)
def make_tagger(nlp: Language, name: str, model: Model):
    return Tagger(nlp.vocab, model, name)

class Tagger(Pipe):
@@ -52,13 +52,14 @@ class Tagger(Pipe):
    DOCS: https://spacy.io/api/tagger
    """
    def __init__(self, vocab, model, name="tagger", *, set_morphology=False):
    def __init__(self, vocab, model, name="tagger", *, labels=None):
        """Initialize a part-of-speech tagger.
        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        set_morphology (bool): Whether to set morphological features.
        labels (List): The set of labels. Defaults to None.
        DOCS: https://spacy.io/api/tagger#init
@@ -67,7 +68,7 @@ class Tagger(Pipe):
        self.model = model
        self.name = name
        self._rehearsal_model = None
        cfg = {"set_morphology": set_morphology}
        cfg = {"labels": labels or []}
        self.cfg = dict(sorted(cfg.items()))

    @property
@@ -80,7 +81,7 @@ class Tagger(Pipe):
        DOCS: https://spacy.io/api/tagger#labels
        """
        return tuple(self.vocab.morphology.tag_names)
        return tuple(self.cfg["labels"])

    def __call__(self, doc):
        """Apply the pipe to a Doc.
@@ -150,9 +151,7 @@
        if isinstance(docs, Doc):
            docs = [docs]
        cdef Doc doc
        cdef int idx = 0
        cdef Vocab vocab = self.vocab
        assign_morphology = self.cfg.get("set_morphology", True)
        for i, doc in enumerate(docs):
            doc_tag_ids = batch_tag_ids[i]
            if hasattr(doc_tag_ids, "get"):
@@ -160,15 +159,7 @@
            for j, tag_id in enumerate(doc_tag_ids):
                # Don't clobber preset POS tags
                if doc.c[j].tag == 0:
                    if doc.c[j].pos == 0 and assign_morphology:
                        # Don't clobber preset lemmas
                        lemma = doc.c[j].lemma
                        vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
                        if lemma != 0 and lemma != doc.c[j].lex.orth:
                            doc.c[j].lemma = lemma
                    else:
                        doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
                    doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
                idx += 1
            doc.is_tagged = True

    def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
@@ -279,55 +270,26 @@
        DOCS: https://spacy.io/api/tagger#begin_training
        """
        lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
        if not any(table in self.vocab.lookups for table in lemma_tables):
            warnings.warn(Warnings.W022)
        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
            langs = ", ".join(util.LEXEME_NORM_LANGS)
            warnings.warn(Warnings.W033.format(model="part-of-speech tagger", langs=langs))
        orig_tag_map = dict(self.vocab.morphology.tag_map)
        new_tag_map = {}
        tags = set()
        for example in get_examples():
            try:
                y = example.y
            except AttributeError:
                raise TypeError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) from None
            for token in y:
                tag = token.tag_
                if tag in orig_tag_map:
                    new_tag_map[tag] = orig_tag_map[tag]
                else:
                    new_tag_map[tag] = {POS: X}
                tags.add(token.tag_)
        cdef Vocab vocab = self.vocab
        if new_tag_map:
            if "_SP" in orig_tag_map:
                new_tag_map["_SP"] = orig_tag_map["_SP"]
            vocab.morphology.load_tag_map(new_tag_map)
        for tag in sorted(tags):
            self.add_label(tag)
        self.set_output(len(self.labels))
        doc_sample = [Doc(self.vocab, words=["hello", "world"])]
        if pipeline is not None:
            for name, component in pipeline:
                if component is self:
                    break
                if hasattr(component, "pipe"):
                    doc_sample = list(component.pipe(doc_sample))
                else:
                    doc_sample = [component(doc) for doc in doc_sample]
        self.model.initialize(X=doc_sample)
        # Get batch of example docs, example outputs to call begin_training().
        # This lets the model infer shapes.
        self.model.initialize()
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd

    def add_label(self, label, values=None):
    def add_label(self, label):
        """Add a new label to the pipe.
        label (str): The label to add.
        values (Dict[int, str]): Optional values to map to the label, e.g. a
            tag map dictionary.
        RETURNS (int): 0 if label is already present, otherwise 1.
        DOCS: https://spacy.io/api/tagger#add_label
@@ -336,22 +298,8 @@
            raise ValueError(Errors.E187)
        if label in self.labels:
            return 0
        if self.model.has_dim("nO"):
            # Here's how the model resizing will work, once the
            # neuron-to-tag mapping is no longer controlled by
            # the Morphology class, which sorts the tag names.
            # The sorting makes adding labels difficult.
            # smaller = self.model._layers[-1]
            # larger = Softmax(len(self.labels)+1, smaller.nI)
            # copy_array(larger.W[:smaller.nO], smaller.W)
            # copy_array(larger.b[:smaller.nO], smaller.b)
            # self.model._layers[-1] = larger
            raise ValueError(TempErrors.T003)
        tag_map = dict(self.vocab.morphology.tag_map)
        if values is None:
            values = {POS: "X"}
        tag_map[label] = values
        self.vocab.morphology.load_tag_map(tag_map)
        self.cfg["labels"].append(label)
        self.vocab.strings.add(label)
        return 1

    def score(self, examples, **kwargs):
@@ -363,11 +311,7 @@
        DOCS: https://spacy.io/api/tagger#score
        """
        scores = {}
        scores.update(Scorer.score_token_attr(examples, "tag", **kwargs))
        scores.update(Scorer.score_token_attr(examples, "pos", **kwargs))
        scores.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
        return scores
        return Scorer.score_token_attr(examples, "tag", **kwargs)

    def to_bytes(self, *, exclude=tuple()):
        """Serialize the pipe to a bytestring.
@@ -381,10 +325,6 @@
        serialize["model"] = self.model.to_bytes
        serialize["vocab"] = self.vocab.to_bytes
        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
        tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
        serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map)
        morph_rules = dict(self.vocab.morphology.exc)
        serialize["morph_rules"] = lambda: srsly.msgpack_dumps(morph_rules)
        return util.to_bytes(serialize, exclude)

    def from_bytes(self, bytes_data, *, exclude=tuple()):
@@ -402,21 +342,8 @@
        except AttributeError:
            raise ValueError(Errors.E149) from None

        def load_tag_map(b):
            tag_map = srsly.msgpack_loads(b)
            self.vocab.morphology.load_tag_map(tag_map)

        def load_morph_rules(b):
            morph_rules = srsly.msgpack_loads(b)
            self.vocab.morphology.load_morph_exceptions(morph_rules)

        self.vocab.morphology = Morphology(self.vocab.strings, dict(),
                                           lemmatizer=self.vocab.morphology.lemmatizer)

        deserialize = {
            "vocab": lambda b: self.vocab.from_bytes(b),
            "tag_map": load_tag_map,
            "morph_rules": load_morph_rules,
            "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
            "model": lambda b: load_model(b),
        }
@@ -431,12 +358,8 @@
        DOCS: https://spacy.io/api/tagger#to_disk
        """
        tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
        morph_rules = dict(self.vocab.morphology.exc)
        serialize = {
            "vocab": lambda p: self.vocab.to_disk(p),
            "tag_map": lambda p: srsly.write_msgpack(p, tag_map),
            "morph_rules": lambda p: srsly.write_msgpack(p, morph_rules),
            "model": lambda p: self.model.to_disk(p),
            "cfg": lambda p: srsly.write_json(p, self.cfg),
        }
@@ -458,22 +381,9 @@
        except AttributeError:
            raise ValueError(Errors.E149) from None

        def load_tag_map(p):
            tag_map = srsly.read_msgpack(p)
            self.vocab.morphology.load_tag_map(tag_map)

        def load_morph_rules(p):
            morph_rules = srsly.read_msgpack(p)
            self.vocab.morphology.load_morph_exceptions(morph_rules)

        self.vocab.morphology = Morphology(self.vocab.strings, dict(),
                                           lemmatizer=self.vocab.morphology.lemmatizer)

        deserialize = {
            "vocab": lambda p: self.vocab.from_disk(p),
            "cfg": lambda p: self.cfg.update(deserialize_config(p)),
            "tag_map": load_tag_map,
            "morph_rules": load_morph_rules,
            "model": load_model,
        }
        util.from_disk(path, deserialize, exclude)
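
With the tag map gone, a tagger's label set lives entirely in its config; a minimal sketch mirroring the updated tests below:

    from spacy.lang.en import English

    nlp = English()
    tagger = nlp.add_pipe("tagger")
    for tag in ("N", "V", "J"):
        tagger.add_label(tag)   # appends to cfg["labels"], no tag map values
    assert tagger.labels == ("N", "V", "J")
    nlp.begin_training()        # output layer sized from len(self.labels)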


@@ -220,7 +220,6 @@ class ConfigSchemaNlp(BaseModel):
    lang: StrictStr = Field(..., title="The base language to use")
    pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
    tokenizer: Callable = Field(..., title="The tokenizer to use")
    lemmatizer: Callable = Field(..., title="The lemmatizer to use")
    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
    before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
    after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")


@@ -201,7 +201,7 @@ def ru_tokenizer():
@pytest.fixture
def ru_lemmatizer():
    pytest.importorskip("pymorphy2")
    return get_lang_class("ru")().vocab.morphology.lemmatizer
    return get_lang_class("ru")().add_pipe("lemmatizer")

@pytest.fixture(scope="session")


@@ -1,21 +1,12 @@
import pytest
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
from spacy import util

@pytest.fixture
def lemmatizer():
    lookups = Lookups()
    lookups.add_table("lemma_lookup", {"dogs": "dog", "boxen": "box", "mice": "mouse"})
    return Lemmatizer(lookups)

@pytest.fixture
def vocab(lemmatizer):
    return Vocab(lemmatizer=lemmatizer)
def vocab():
    return Vocab()

def test_empty_doc(vocab):
@@ -30,14 +21,6 @@ def test_single_word(vocab):
    assert doc.text == "a"

def test_lookup_lemmatization(vocab):
    doc = Doc(vocab, words=["dogs", "dogses"])
    assert doc[0].text == "dogs"
    assert doc[0].lemma_ == "dog"
    assert doc[1].text == "dogses"
    assert doc[1].lemma_ == "dogses"

def test_create_from_words_and_text(vocab):
    # no whitespace in words
    words = ["'", "dogs", "'", "run"]


@@ -1,23 +1,17 @@
import pytest
from spacy.symbols import POS, PRON, VERB

@pytest.fixture
def i_has(en_tokenizer):
    doc = en_tokenizer("I has")
    tag_map = {
        "PRP": {POS: PRON, "PronType": "prs"},
        "VBZ": {
            POS: VERB,
            "VerbForm": "fin",
            "Tense": "pres",
            "Number": "sing",
            "Person": "three",
        },
    }
    en_tokenizer.vocab.morphology.load_tag_map(tag_map)
    doc[0].tag_ = "PRP"
    doc[1].tag_ = "VBZ"
    doc[0].morph_ = {"PronType": "prs"}
    doc[1].morph_ = {
        "VerbForm": "fin",
        "Tense": "pres",
        "Number": "sing",
        "Person": "three",
    }
    return doc


@@ -124,7 +124,6 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
    assert doc[0].text == "The players"
    assert doc[0].tag_ == "NN"
    assert doc[0].pos_ == "NOUN"
    assert doc[0].lemma_ == "The players"
    doc = get_doc(
        tokens.vocab,
        words=[t.text for t in tokens],
@@ -143,11 +142,9 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
    assert doc[0].text == "The players"
    assert doc[0].tag_ == "NN"
    assert doc[0].pos_ == "NOUN"
    assert doc[0].lemma_ == "The players"
    assert doc[1].text == "start ."
    assert doc[1].tag_ == "VBZ"
    assert doc[1].pos_ == "VERB"
    assert doc[1].lemma_ == "start ."

def test_doc_retokenize_spans_merge_heads(en_tokenizer):


@@ -1,21 +0,0 @@
from spacy.symbols import POS, PRON, VERB, DET, NOUN, PUNCT
from ...util import get_doc
def test_en_tagger_load_morph_exc(en_tokenizer):
text = "I like his style."
tags = ["PRP", "VBP", "PRP$", "NN", "."]
tag_map = {
"PRP": {POS: PRON},
"VBP": {POS: VERB},
"PRP$": {POS: DET},
"NN": {POS: NOUN},
".": {POS: PUNCT},
}
morph_exc = {"VBP": {"like": {"lemma": "luck"}}}
en_tokenizer.vocab.morphology.load_tag_map(tag_map)
en_tokenizer.vocab.morphology.load_morph_exceptions(morph_exc)
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags)
assert doc[1].tag_ == "VBP"
assert doc[1].lemma_ == "luck"


@@ -3,15 +3,16 @@ import pytest
from ...util import get_doc

@pytest.mark.xfail(reason="TODO: investigate why lemmatizer fails here")
def test_ru_doc_lemmatization(ru_tokenizer):
def test_ru_doc_lemmatization(ru_lemmatizer):
    words = ["мама", "мыла", "раму"]
    tags = [
        "NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
        "VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
        "NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
    ]
    pos = ["NOUN", "VERB", "NOUN"]
    morphs = [
        "Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
        "Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
        "Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
    ]
    doc = get_doc(ru_tokenizer.vocab, words=words, tags=tags)
    doc = get_doc(ru_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
    doc = ru_lemmatizer(doc)
    lemmas = [token.lemma_ for token in doc]
    assert lemmas == ["мама", "мыть", "рама"]
@@ -27,43 +28,51 @@
    ],
)
def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
    assert sorted(ru_lemmatizer.noun(text)) == lemmas
    doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"])
    result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
    assert sorted(result_lemmas) == lemmas

@pytest.mark.parametrize(
    "text,pos,morphology,lemma",
    "text,pos,morph,lemma",
    [
        ("рой", "NOUN", None, "рой"),
        ("рой", "VERB", None, "рыть"),
        ("клей", "NOUN", None, "клей"),
        ("клей", "VERB", None, "клеить"),
        ("три", "NUM", None, "три"),
        ("кос", "NOUN", {"Number": "Sing"}, "кос"),
        ("кос", "NOUN", {"Number": "Plur"}, "коса"),
        ("кос", "ADJ", None, "косой"),
        ("потом", "NOUN", None, "пот"),
        ("потом", "ADV", None, "потом"),
    ],
    [
        ("рой", "NOUN", "", "рой"),
        ("рой", "VERB", "", "рыть"),
        ("клей", "NOUN", "", "клей"),
        ("клей", "VERB", "", "клеить"),
        ("три", "NUM", "", "три"),
        ("кос", "NOUN", "Number=Sing", "кос"),
        ("кос", "NOUN", "Number=Plur", "коса"),
        ("кос", "ADJ", "", "косой"),
        ("потом", "NOUN", "", "пот"),
        ("потом", "ADV", "", "потом"),
    ],
)
def test_ru_lemmatizer_works_with_different_pos_homonyms(
    ru_lemmatizer, text, pos, morphology, lemma
    ru_lemmatizer, text, pos, morph, lemma
):
    assert ru_lemmatizer(text, pos, morphology) == [lemma]
    doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=[pos], morphs=[morph])
    result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
    assert result_lemmas == [lemma]

@pytest.mark.parametrize(
    "text,morphology,lemma",
    "text,morph,lemma",
    [
        ("гвоздики", {"Gender": "Fem"}, "гвоздика"),
        ("гвоздики", {"Gender": "Masc"}, "гвоздик"),
        ("вина", {"Gender": "Fem"}, "вина"),
        ("вина", {"Gender": "Neut"}, "вино"),
    ],
    [
        ("гвоздики", "Gender=Fem", "гвоздика"),
        ("гвоздики", "Gender=Masc", "гвоздик"),
        ("вина", "Gender=Fem", "вина"),
        ("вина", "Gender=Neut", "вино"),
    ],
)
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morphology, lemma):
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morph, lemma):
    assert ru_lemmatizer.noun(text, morphology) == [lemma]
    doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"], morphs=[morph])
    result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
    assert result_lemmas == [lemma]

def test_ru_lemmatizer_punct(ru_lemmatizer):
    assert ru_lemmatizer.punct("«") == ['"']
    assert ru_lemmatizer.punct("»") == ['"']
    doc = get_doc(ru_lemmatizer.vocab, words=["«"], pos=["PUNCT"])
    assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
    doc = get_doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
    assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']


@@ -0,0 +1,34 @@
import pytest
from spacy import registry
from spacy.lookups import Lookups
from spacy.util import get_lang_class
# fmt: off
# Only include languages with no external dependencies
# excluded: ru, uk
# excluded for custom tables: pl
LANGUAGES = ["el", "en", "fr", "nl"]
# fmt: on
@pytest.mark.parametrize("lang", LANGUAGES)
def test_lemmatizer_initialize(lang, capfd):
@registry.assets("lemmatizer_init_lookups")
def lemmatizer_init_lookups():
lookups = Lookups()
lookups.add_table("lemma_lookup", {"cope": "cope"})
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
return lookups
"""Test that languages can be initialized."""
nlp = get_lang_class(lang)()
nlp.add_pipe(
"lemmatizer", config={"lookups": {"@assets": "lemmatizer_init_lookups"}}
)
# Check for stray print statements (see #3342)
doc = nlp("test") # noqa: F841
captured = capfd.readouterr()
assert not captured.out


@@ -1,14 +1,11 @@
import pytest
from spacy.morphology import Morphology
from spacy.strings import StringStore, get_string_id
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

@pytest.fixture
def morphology():
    lemmatizer = Lemmatizer(Lookups())
    return Morphology(StringStore(), {}, lemmatizer)
    return Morphology(StringStore())

def test_init(morphology):


@@ -2,21 +2,18 @@ import pytest
import pickle
from spacy.morphology import Morphology
from spacy.strings import StringStore
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

@pytest.fixture
def morphology():
    tag_map = {"A": {"POS": "X"}, "B": {"POS": "NOUN"}}
    exc = {"A": {"a": {"POS": "VERB"}}}
    lemmatizer = Lemmatizer(Lookups())
    return Morphology(StringStore(), tag_map, lemmatizer, exc=exc)
    morphology = Morphology(StringStore())
    morphology.add("Feat1=Val1|Feat2=Val2")
    morphology.add("Feat3=Val3|Feat4=Val4")
    return morphology

def test_morphology_pickle_roundtrip(morphology):
    b = pickle.dumps(morphology)
    reloaded_morphology = pickle.loads(b)
    assert morphology.tag_map == reloaded_morphology.tag_map
    assert morphology.exc == reloaded_morphology.exc
    assert reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"]) == "Feat1=Val1|Feat2=Val2"
    assert reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"]) == "Feat3=Val3|Feat4=Val4"


@@ -82,10 +82,10 @@ def test_parser_merge_pp(en_tokenizer):
    text = "A phrase with another phrase occurs"
    heads = [1, 4, -1, 1, -2, 0]
    deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT"]
    tags = ["DT", "NN", "IN", "DT", "NN", "VBZ"]
    pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB"]
    tokens = en_tokenizer(text)
    doc = get_doc(
        tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags
        tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos,
    )
    with doc.retokenize() as retokenizer:
        for np in doc.noun_chunks:


@@ -0,0 +1,109 @@
import pytest
from spacy import util, registry
from spacy.lang.en import English
from spacy.lookups import Lookups, load_lookups
from ..util import make_tempdir
@pytest.fixture
def nlp():
return English()
@pytest.fixture
def lemmatizer(nlp):
@registry.assets("cope_lookups")
def cope_lookups():
lookups = Lookups()
lookups.add_table("lemma_lookup", {"cope": "cope"})
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
return lookups
lemmatizer = nlp.add_pipe(
"lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}}
)
return lemmatizer
def test_lemmatizer_init(nlp):
@registry.assets("cope_lookups")
def cope_lookups():
lookups = Lookups()
lookups.add_table("lemma_lookup", {"cope": "cope"})
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
return lookups
lemmatizer = nlp.add_pipe(
"lemmatizer", config={"mode": "lookup", "lookups": {"@assets": "cope_lookups"}}
)
assert isinstance(lemmatizer.lookups, Lookups)
assert lemmatizer.mode == "lookup"
# replace any tables from spacy-lookups-data
lemmatizer.lookups = Lookups()
doc = nlp("coping")
# lookup with no tables sets text as lemma
assert doc[0].lemma_ == "coping"
nlp.remove_pipe("lemmatizer")
@registry.assets("empty_lookups")
def empty_lookups():
return Lookups()
with pytest.raises(ValueError):
nlp.add_pipe(
"lemmatizer",
config={"mode": "lookup", "lookups": {"@assets": "empty_lookups"}},
)
def test_lemmatizer_config(nlp, lemmatizer):
doc = nlp.make_doc("coping")
doc[0].pos_ = "VERB"
assert doc[0].lemma_ == ""
doc = lemmatizer(doc)
assert doc[0].text == "coping"
assert doc[0].lemma_ == "cope"
doc = nlp.make_doc("coping")
doc[0].pos_ = "VERB"
assert doc[0].lemma_ == ""
doc = lemmatizer(doc)
assert doc[0].text == "coping"
assert doc[0].lemma_ == "cope"
def test_lemmatizer_serialize(nlp, lemmatizer):
@registry.assets("cope_lookups")
def cope_lookups():
lookups = Lookups()
lookups.add_table("lemma_lookup", {"cope": "cope"})
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
return lookups
nlp2 = English()
lemmatizer2 = nlp2.add_pipe(
"lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}}
)
lemmatizer2.from_bytes(lemmatizer.to_bytes())
assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables
# Also test the results are still the same after IO
with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2.make_doc("coping")
doc2[0].pos_ = "VERB"
assert doc2[0].lemma_ == ""
doc2 = lemmatizer(doc2)
assert doc2[0].text == "coping"
assert doc2[0].lemma_ == "cope"
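
Because __init__ resolves the lemmatize method via getattr(self, f"{mode}_lemmatize"), a custom mode only needs a matching method on a subclass; a hypothetical sketch (the class and mode name are invented here):

    from typing import List
    from spacy.lang.en import English
    from spacy.pipeline import Lemmatizer
    from spacy.tokens import Token

    class CustomLemmatizer(Lemmatizer):
        def custom_lemmatize(self, token: Token) -> List[str]:
            # trivial strategy for illustration only
            return [token.text.lower()]

    nlp = English()
    lemmatizer = CustomLemmatizer(nlp.vocab, model=None, mode="custom")
    doc = lemmatizer(nlp.make_doc("Coping"))
    assert doc[0].lemma_ == "coping"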


@@ -23,13 +23,12 @@ def test_tagger_begin_training_tag_map():
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
    orig_tag_count = len(tagger.labels)
    tagger.add_label("A", {"POS": "NOUN"})
    tagger.add_label("A")
    nlp.begin_training()
    assert nlp.vocab.morphology.tag_map["A"] == {POS: NOUN}
    assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)

TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}
TAGS = ("N", "V", "J")
MORPH_RULES = {"V": {"like": {"lemma": "luck"}}}
@@ -42,15 +41,12 @@ TRAIN_DATA = [
def test_overfitting_IO():
    # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
    nlp = English()
    nlp.vocab.morphology.load_tag_map(TAG_MAP)
    nlp.vocab.morphology.load_morph_exceptions(MORPH_RULES)
    tagger = nlp.add_pipe("tagger", config={"set_morphology": True})
    nlp.vocab.morphology.load_tag_map(TAG_MAP)
    tagger = nlp.add_pipe("tagger")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    for tag, values in TAG_MAP.items():
        tagger.add_label(tag, values)
    for tag in TAGS:
        tagger.add_label(tag)
    optimizer = nlp.begin_training()
    for i in range(50):
@@ -65,7 +61,6 @@ def test_overfitting_IO():
    assert doc[1].tag_ is "V"
    assert doc[2].tag_ is "J"
    assert doc[3].tag_ is "N"
    assert doc[1].lemma_ == "luck"
    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
@@ -76,4 +71,3 @@ def test_overfitting_IO():
    assert doc2[1].tag_ is "V"
    assert doc2[2].tag_ is "J"
    assert doc2[3].tag_ is "N"
    assert doc[1].lemma_ == "luck"


@@ -8,10 +8,8 @@ from spacy.attrs import IS_PUNCT, ORTH, LOWER
from spacy.symbols import POS, VERB
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
from spacy.tokens import Doc, Span
from spacy.lang.en.lemmatizer import is_base_form

from ..util import get_doc, make_tempdir
@@ -157,16 +155,15 @@ def test_issue590(en_vocab):
    assert len(matches) == 2

@pytest.mark.skip(reason="Old vocab-based lemmatization")
def test_issue595():
    """Test lemmatization of base forms"""
    words = ["Do", "n't", "feed", "the", "dog"]
    tag_map = {"VB": {POS: VERB, "VerbForm": "inf"}}
    lookups = Lookups()
    lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
    lookups.add_table("lemma_index", {"verb": {}})
    lookups.add_table("lemma_exc", {"verb": {}})
    lemmatizer = Lemmatizer(lookups, is_base_form=is_base_form)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    vocab = Vocab()
    doc = Doc(vocab, words=words)
    doc[2].tag_ = "VB"
    assert doc[2].text == "feed"
@@ -389,6 +386,7 @@ def test_issue891(en_tokenizer, text):
    assert tokens[1].text == "/"

@pytest.mark.skip(reason="Old vocab-based lemmatization")
@pytest.mark.parametrize(
    "text,tag,lemma",
    [("anus", "NN", "anus"), ("princess", "NN", "princess"), ("inner", "JJ", "inner")],

View File

@ -6,7 +6,6 @@ from spacy.lang.en import English
 from spacy.lang.lex_attrs import LEX_ATTRS
 from spacy.matcher import Matcher
 from spacy.tokenizer import Tokenizer
-from spacy.lemmatizer import Lemmatizer
 from spacy.lookups import Lookups
 from spacy.symbols import ORTH, LEMMA, POS, VERB

@ -57,6 +56,7 @@ def test_issue1242():
     assert len(docs[1]) == 1

+@pytest.mark.skip(reason="v3 no longer supports LEMMA/POS in tokenizer special cases")
 def test_issue1250():
     """Test cached special cases."""
     special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}]

@ -87,20 +87,6 @@ def test_issue1375():
     assert doc[1].nbor(1).text == "2"

-def test_issue1387():
-    tag_map = {"VBG": {POS: VERB, "VerbForm": "part"}}
-    lookups = Lookups()
-    lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
-    lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
-    lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
-    lemmatizer = Lemmatizer(lookups)
-    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
-    doc = Doc(vocab, words=["coping"])
-    doc[0].tag_ = "VBG"
-    assert doc[0].text == "coping"
-    assert doc[0].lemma_ == "cope"

 def test_issue1434():
     """Test matches occur when optional element at end of short doc."""
     pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}]

View File

@ -130,8 +130,6 @@ def test_issue1727():
     vectors = Vectors(data=data, keys=["I", "am", "Matt"])
     tagger = nlp.create_pipe("tagger")
     tagger.add_label("PRP")
-    with pytest.warns(UserWarning):
-        tagger.begin_training()
     assert tagger.cfg.get("pretrained_dims", 0) == 0
     tagger.vocab.vectors = vectors
     with make_tempdir() as path:

View File

@ -19,8 +19,8 @@ def test_issue2564():
     """Test the tagger sets is_tagged correctly when used via Language.pipe."""
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
-    with pytest.warns(UserWarning):
-        tagger.begin_training()  # initialise weights
+    tagger.add_label("A")
+    tagger.begin_training()
     doc = nlp("hello world")
     assert doc.is_tagged
     docs = nlp.pipe(["hello", "world"])

View File

@ -241,11 +241,11 @@ def test_issue3449():
     assert t3[5].text == "I"

-@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue3456():
     # this crashed because of a padding error in layer.ops.unflatten in thinc
     nlp = English()
-    nlp.add_pipe("tagger")
+    tagger = nlp.add_pipe("tagger")
+    tagger.add_label("A")
     nlp.begin_training()
     list(nlp.pipe(["hi", ""]))

View File

@ -149,13 +149,15 @@ def test_issue3540(en_vocab):
     gold_text = ["I", "live", "in", "NewYork", "right", "now"]
     assert [token.text for token in doc] == gold_text
     gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
+    for i, lemma in enumerate(gold_lemma):
+        doc[i].lemma_ = lemma
     assert [token.lemma_ for token in doc] == gold_lemma
     vectors_1 = [token.vector for token in doc]
     assert len(vectors_1) == len(doc)

     with doc.retokenize() as retokenizer:
         heads = [(doc[3], 1), doc[2]]
-        attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
+        attrs = {"POS": ["PROPN", "PROPN"], "LEMMA": ["New", "York"], "DEP": ["pobj", "compound"]}
         retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
     gold_text = ["I", "live", "in", "New", "York", "right", "now"]

View File

@ -271,6 +271,7 @@ def test_issue4267():
     assert token.ent_iob == 2

+@pytest.mark.skip(reason="lemmatizer lookups no longer in vocab")
 def test_issue4272():
     """Test that lookup table can be accessed from Token.lemma if no POS tags
     are available."""

View File

@ -62,8 +62,7 @@ def tagger():
     # need to add model for two reasons:
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
-    with pytest.warns(UserWarning):
-        tagger.begin_training(pipeline=nlp.pipeline)
+    tagger.begin_training(pipeline=nlp.pipeline)
     return tagger

View File

@ -44,8 +44,8 @@ def blank_parser(en_vocab):
 def taggers(en_vocab):
     cfg = {"model": DEFAULT_TAGGER_MODEL}
     model = registry.make_from_config(cfg, validate=True)["model"]
-    tagger1 = Tagger(en_vocab, model, set_morphology=True)
-    tagger2 = Tagger(en_vocab, model, set_morphology=True)
+    tagger1 = Tagger(en_vocab, model)
+    tagger2 = Tagger(en_vocab, model)
     return tagger1, tagger2

@ -125,8 +125,8 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
     tagger2.to_disk(file_path2)
     cfg = {"model": DEFAULT_TAGGER_MODEL}
     model = registry.make_from_config(cfg, validate=True)["model"]
-    tagger1_d = Tagger(en_vocab, model, set_morphology=True).from_disk(file_path1)
-    tagger2_d = Tagger(en_vocab, model, set_morphology=True).from_disk(file_path2)
+    tagger1_d = Tagger(en_vocab, model).from_disk(file_path1)
+    tagger2_d = Tagger(en_vocab, model).from_disk(file_path2)
     assert tagger1_d.to_bytes() == tagger2_d.to_bytes()

View File

@ -8,7 +8,6 @@ from ..util import make_tempdir

 test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])]
 test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
-default_strings = ("_SP", "POS=SPACE")

 @pytest.mark.parametrize("text", ["rat"])

@ -34,10 +33,8 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
     assert vocab1.to_bytes() == vocab1_b
     new_vocab1 = Vocab().from_bytes(vocab1_b)
     assert new_vocab1.to_bytes() == vocab1_b
-    assert len(new_vocab1.strings) == len(strings1) + 2  # adds _SP and POS=SPACE
-    assert sorted([s for s in new_vocab1.strings]) == sorted(
-        strings1 + list(default_strings)
-    )
+    assert len(new_vocab1.strings) == len(strings1)
+    assert sorted([s for s in new_vocab1.strings]) == sorted(strings1)

 @pytest.mark.parametrize("strings1,strings2", test_strings)

@ -52,16 +49,12 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
     vocab1_d = Vocab().from_disk(file_path1)
     vocab2_d = Vocab().from_disk(file_path2)
     # check strings rather than lexemes, which are only reloaded on demand
-    assert strings1 == [s for s in vocab1_d.strings if s not in default_strings]
-    assert strings2 == [s for s in vocab2_d.strings if s not in default_strings]
+    assert strings1 == [s for s in vocab1_d.strings]
+    assert strings2 == [s for s in vocab2_d.strings]
     if strings1 == strings2:
-        assert [s for s in vocab1_d.strings if s not in default_strings] == [
-            s for s in vocab2_d.strings if s not in default_strings
-        ]
+        assert [s for s in vocab1_d.strings] == [s for s in vocab2_d.strings]
     else:
-        assert [s for s in vocab1_d.strings if s not in default_strings] != [
-            s for s in vocab2_d.strings if s not in default_strings
-        ]
+        assert [s for s in vocab1_d.strings] != [s for s in vocab2_d.strings]

 @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)

@ -80,7 +73,7 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr):
     # Reported in #2153
     vocab = Vocab(strings=strings)
     vocab.from_bytes(vocab.to_bytes())
-    assert len(vocab.strings) == len(strings) + 2  # adds _SP and POS=SPACE
+    assert len(vocab.strings) == len(strings)

 @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)

View File

@ -1,64 +0,0 @@
import pytest
from spacy.tokens import Doc
from spacy.language import Language
from spacy.lookups import Lookups
from spacy.lemmatizer import Lemmatizer
@pytest.mark.skip(reason="We probably don't want to support this anymore in v3?")
def test_lemmatizer_reflects_lookups_changes():
"""Test for an issue that'd cause lookups available in a model loaded from
disk to not be reflected in the lemmatizer."""
nlp = Language()
assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "foo"
table = nlp.vocab.lookups.add_table("lemma_lookup")
table["foo"] = "bar"
assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "bar"
table = nlp.vocab.lookups.get_table("lemma_lookup")
table["hello"] = "world"
# The update to the table should be reflected in the lemmatizer
assert Doc(nlp.vocab, words=["hello"])[0].lemma_ == "world"
new_nlp = Language()
table = new_nlp.vocab.lookups.add_table("lemma_lookup")
table["hello"] = "hi"
assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "hi"
nlp_bytes = nlp.to_bytes()
new_nlp.from_bytes(nlp_bytes)
# Make sure we have the previously saved lookup table
assert "lemma_lookup" in new_nlp.vocab.lookups
assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2
assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world"
assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar"
assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world"
def test_tagger_warns_no_lookups():
nlp = Language()
nlp.vocab.lookups = Lookups()
assert not len(nlp.vocab.lookups)
tagger = nlp.add_pipe("tagger")
with pytest.warns(UserWarning):
tagger.begin_training()
with pytest.warns(UserWarning):
nlp.begin_training()
nlp.vocab.lookups.add_table("lemma_lookup")
nlp.vocab.lookups.add_table("lexeme_norm")
nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
with pytest.warns(None) as record:
nlp.begin_training()
assert not record.list
def test_lemmatizer_without_is_base_form_implementation():
# Norwegian example from #5658
lookups = Lookups()
lookups.add_table("lemma_rules", {"noun": []})
lookups.add_table("lemma_index", {"noun": {}})
lookups.add_table("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}})
lemmatizer = Lemmatizer(lookups, is_base_form=None)
assert lemmatizer(
"Formuesskatten",
"noun",
{"Definite": "def", "Gender": "masc", "Number": "sing"},
) == ["formuesskatt"]

View File

@ -112,16 +112,15 @@ def test_tokenizer_validate_special_case(tokenizer, text, tokens):

 @pytest.mark.parametrize(
-    "text,tokens", [("lorem", [{"orth": "lo", "tag": "NN"}, {"orth": "rem"}])]
+    "text,tokens", [("lorem", [{"orth": "lo", "norm": "LO"}, {"orth": "rem"}])]
 )
 def test_tokenizer_add_special_case_tag(text, tokens):
-    vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
+    vocab = Vocab()
     tokenizer = Tokenizer(vocab, {}, None, None, None)
     tokenizer.add_special_case(text, tokens)
     doc = tokenizer(text)
     assert doc[0].text == tokens[0]["orth"]
-    assert doc[0].tag_ == tokens[0]["tag"]
-    assert doc[0].pos_ == "NOUN"
+    assert doc[0].norm_ == tokens[0]["norm"]
     assert doc[1].text == tokens[1]["orth"]
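
A small sketch of the updated behavior this test pins down: tokenizer special cases can still set orth-level attributes like `NORM`, but no longer `TAG`/`POS`/`LEMMA` (usage illustrative):

```python
# Sketch: special cases are limited to orth/norm variants in v3.
from spacy.attrs import ORTH, NORM
from spacy.lang.en import English

nlp = English()
nlp.tokenizer.add_special_case("lorem", [{ORTH: "lo", NORM: "LO"}, {ORTH: "rem"}])
doc = nlp.make_doc("lorem")
assert [t.text for t in doc] == ["lo", "rem"]
assert doc[0].norm_ == "LO"
```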

View File

@ -11,7 +11,7 @@ from .span cimport Span
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..structs cimport LexemeC, TokenC
-from ..attrs cimport TAG, MORPH
+from ..attrs cimport MORPH
 from ..vocab cimport Vocab

 from .underscore import is_writable_attr

@ -365,8 +365,6 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
                 doc[token_index + i]._.set(ext_attr_key, ext_attr_value)
         # NB: We need to call get_string_id here because only the keys are
         # "intified" (since we support "KEY": [value, value] syntax here).
-        elif attr_name == TAG:
-            doc.vocab.morphology.assign_tag(token, get_string_id(attr_value))
         else:
             # Set attributes on both token and lexeme to take care of token
             # attribute vs. lexical attribute without having to enumerate

@ -431,8 +429,6 @@ def set_token_attrs(Token py_token, attrs):
         if attr_name == "_":  # Set extension attributes
             for ext_attr_key, ext_attr_value in attr_value.items():
                 py_token._.set(ext_attr_key, ext_attr_value)
-        elif attr_name == TAG:
-            doc.vocab.morphology.assign_tag(token, attr_value)
         else:
             # Set attributes on both token and lexeme to take care of token
             # attribute vs. lexical attribute without having to enumerate

View File

@ -832,13 +832,6 @@ cdef class Doc:
                         rel_head_index=abs_head_index-i
                     )
                 )
-        # Do TAG first. This lets subsequent loop override stuff like POS, LEMMA
-        if TAG in attrs:
-            col = attrs.index(TAG)
-            for i in range(length):
-                value = values[col * stride + i]
-                if value != 0:
-                    self.vocab.morphology.assign_tag(&tokens[i], value)
         # Verify ENT_IOB are proper integers
         if ENT_IOB in attrs:
             iob_strings = Token.iob_strings()

@ -857,12 +850,11 @@ cdef class Doc:
         for i in range(length):
             token = &self.c[i]
             for j in range(n_attrs):
-                if attr_ids[j] != TAG:
-                    value = values[j * stride + i]
-                    if attr_ids[j] == MORPH:
-                        # add morph to morphology table
-                        self.vocab.morphology.add(self.vocab.strings[value])
-                    Token.set_struct_attr(token, attr_ids[j], value)
+                value = values[j * stride + i]
+                if attr_ids[j] == MORPH:
+                    # add morph to morphology table
+                    self.vocab.morphology.add(self.vocab.strings[value])
+                Token.set_struct_attr(token, attr_ids[j], value)
         # Set flags
         self.is_parsed = bool(self.is_parsed or HEAD in attrs)
         self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
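
With the special-casing gone, `TAG` round-trips through `Doc.from_array` like any other token attribute, with no morphology side effects. A minimal sketch of the resulting usage (values are illustrative):

```python
# Sketch: TAG is now set by the generic attribute loop in from_array.
import numpy
from spacy.attrs import TAG
from spacy.lang.en import English

nlp = English()
doc = nlp.make_doc("hello world")
nn = nlp.vocab.strings.add("NN")
doc.from_array([TAG], numpy.asarray([[nn], [nn]], dtype="uint64"))
assert [t.tag_ for t in doc] == ["NN", "NN"]
```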

View File

@ -332,11 +332,7 @@ cdef class Token:
         inflectional suffixes.
         """
         def __get__(self):
-            if self.c.lemma == 0:
-                lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth)
-                return self.vocab.strings[lemma_]
-            else:
-                return self.c.lemma
+            return self.c.lemma

         def __set__(self, attr_t lemma):
             self.c.lemma = lemma

@ -355,7 +351,7 @@ cdef class Token:
             return self.c.tag

         def __set__(self, attr_t tag):
-            self.vocab.morphology.assign_tag(self.c, tag)
+            self.c.tag = tag

     property dep:
         """RETURNS (uint64): ID of syntactic dependency label."""

@ -888,10 +884,7 @@ cdef class Token:
         with no inflectional suffixes.
         """
         def __get__(self):
-            if self.c.lemma == 0:
-                return self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth)
-            else:
-                return self.vocab.strings[self.c.lemma]
+            return self.vocab.strings[self.c.lemma]

         def __set__(self, unicode lemma_):
             self.c.lemma = self.vocab.strings.add(lemma_)
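
After this change, `Token.tag` and `Token.lemma` are plain struct attributes: setting a tag no longer assigns POS or morphology, and an unset lemma falls back to the empty string rather than a vocab-level lookup. A minimal sketch of the observable behavior:

```python
# Sketch: tag and lemma as plain attributes.
from spacy.lang.en import English

nlp = English()
doc = nlp.make_doc("feed")
assert doc[0].lemma_ == ""  # no vocab-level lookup fallback anymore
doc[0].tag_ = "VB"          # no longer triggers tag-map POS/morph assignment
assert doc[0].tag_ == "VB" and doc[0].pos_ == ""
doc[0].lemma_ = "feed"
assert doc[0].lemma_ == "feed"
```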

View File

@ -9,11 +9,10 @@ from .lexeme cimport EMPTY_LEXEME, OOV_RANK
 from .lexeme cimport Lexeme
 from .typedefs cimport attr_t
 from .tokens.token cimport Token
-from .attrs cimport LANG, ORTH, TAG, POS
+from .attrs cimport LANG, ORTH

 from .compat import copy_reg
 from .errors import Errors
-from .lemmatizer import Lemmatizer
 from .attrs import intify_attrs, NORM, IS_STOP
 from .vectors import Vectors
 from .util import registry

@ -23,7 +22,7 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang

-def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=True):
+def create_vocab(lang, defaults, vectors_name=None, load_data=True):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
     if load_data:

@ -43,7 +42,6 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True):
     )
     return Vocab(
         lex_attr_getters=lex_attrs,
-        lemmatizer=lemmatizer,
         lookups=lookups,
         writing_system=defaults.writing_system,
         get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),

@ -58,17 +56,13 @@ cdef class Vocab:
    DOCS: https://spacy.io/api/vocab
    """
-    def __init__(self, lex_attr_getters=None, lemmatizer=None,
-                 strings=tuple(), lookups=None, tag_map={},
+    def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
                  oov_prob=-20., vectors_name=None, writing_system={},
                  get_noun_chunks=None, **deprecated_kwargs):
        """Create the vocabulary.

        lex_attr_getters (dict): A dictionary mapping attribute IDs to
            functions to compute them. Defaults to `None`.
-        tag_map (dict): Dictionary mapping fine-grained tags to coarse-grained
-            parts-of-speech, and optionally morphological attributes.
-        lemmatizer (object): A lemmatizer. Defaults to `None`.
        strings (StringStore): StringStore that maps strings to integers, and
            vice versa.
        lookups (Lookups): Container for large lookup tables and dictionaries.

@ -78,8 +72,6 @@ cdef class Vocab:
        lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
        if lookups in (None, True, False):
            lookups = Lookups()
-        if lemmatizer in (None, True, False):
-            lemmatizer = Lemmatizer(lookups)
        self.cfg = {'oov_prob': oov_prob}
        self.mem = Pool()
        self._by_orth = PreshMap()

@ -89,7 +81,7 @@ cdef class Vocab:
        for string in strings:
            _ = self[string]
        self.lex_attr_getters = lex_attr_getters
-        self.morphology = Morphology(self.strings, tag_map, lemmatizer)
+        self.morphology = Morphology(self.strings)
        self.vectors = Vectors(name=vectors_name)
        self.lookups = lookups
        self.writing_system = writing_system

@ -268,12 +260,6 @@ cdef class Vocab:
        # Set the special tokens up to have arbitrary attributes
        lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
        token.lex = lex
-        if TAG in props:
-            self.morphology.assign_tag(token, props[TAG])
-        elif POS in props:
-            # Don't allow POS to be set without TAG -- this causes problems,
-            # see #1773
-            props.pop(POS)
        for attr_id, value in props.items():
            Token.set_struct_attr(token, attr_id, value)
        # NORM is the only one that overlaps between the two
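
The slimmed-down `Vocab` signature no longer accepts `tag_map` or `lemmatizer`; lookups tables remain the only lemma-related state it carries. A minimal sketch of the resulting constructor usage (table contents illustrative):

```python
# Sketch: constructing a Vocab in v3 -- no tag_map, no lemmatizer.
from spacy.lookups import Lookups
from spacy.vocab import Vocab

lookups = Lookups()
lookups.add_table("lexeme_norm", {"a": "A"})
vocab = Vocab(strings=["hello", "world"], lookups=lookups)
assert "hello" in vocab.strings
assert "lexeme_norm" in vocab.lookups
```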

View File

@ -1,102 +1,263 @@
----
-title: Lemmatizer
-teaser: Assign the base forms of words
-tag: class
-source: spacy/lemmatizer.py
----
-
-<!-- TODO: rewrite once it's converted to pipe -->
-
-The `Lemmatizer` supports simple part-of-speech-sensitive suffix rules and
-lookup tables.
-
-## Lemmatizer.\_\_init\_\_ {#init tag="method"}
-
-Initialize a `Lemmatizer`. Typically, this happens under the hood within spaCy
-when a `Language` subclass and its `Vocab` is initialized.
-
-> #### Example
->
-> ```python
-> from spacy.lemmatizer import Lemmatizer
-> from spacy.lookups import Lookups
-> lookups = Lookups()
-> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
-> lemmatizer = Lemmatizer(lookups)
-> ```
->
-> For examples of the data format, see the
-> [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo.
-
-| Name                                   | Type                      | Description |
-| -------------------------------------- | ------------------------- | ----------- |
-| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. |
-
-## Lemmatizer.\_\_call\_\_ {#call tag="method"}
-
-Lemmatize a string.
-
-> #### Example
->
-> ```python
-> from spacy.lemmatizer import Lemmatizer
-> from spacy.lookups import Lookups
-> lookups = Lookups()
-> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
-> lemmatizer = Lemmatizer(lookups)
-> lemmas = lemmatizer("ducks", "NOUN")
-> assert lemmas == ["duck"]
-> ```
-
-| Name         | Type          | Description |
-| ------------ | ------------- | ----------- |
-| `string`     | str           | The string to lemmatize, e.g. the token text. |
-| `univ_pos`   | str / int     | The token's universal part-of-speech tag. |
-| `morphology` | dict / `None` | Morphological features following the [Universal Dependencies](http://universaldependencies.org/) scheme. |
-| **RETURNS**  | list          | The available lemmas for the string. |
-
-## Lemmatizer.lookup {#lookup tag="method" new="2"}
-
-Look up a lemma in the lookup table, if available. If no lemma is found, the
-original string is returned. Languages can provide a
-[lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`.
-
-> #### Example
->
-> ```python
-> lookups = Lookups()
-> lookups.add_table("lemma_lookup", {"going": "go"})
-> assert lemmatizer.lookup("going") == "go"
-> ```
-
-| Name        | Type | Description |
-| ----------- | ---- | ----------- |
-| `string`    | str  | The string to look up. |
-| `orth`      | int  | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. |
-| **RETURNS** | str  | The lemma if the string was found, otherwise the original string. |
-
-## Lemmatizer.is_base_form {#is_base_form tag="method"}
-
-Check whether we're dealing with an uninflected paradigm, so we can avoid
-lemmatization entirely.
-
-> #### Example
->
-> ```python
-> pos = "verb"
-> morph = {"VerbForm": "inf"}
-> is_base_form = lemmatizer.is_base_form(pos, morph)
-> assert is_base_form == True
-> ```
-
-| Name         | Type      | Description |
-| ------------ | --------- | ----------- |
-| `univ_pos`   | str / int | The token's universal part-of-speech tag. |
-| `morphology` | dict      | The token's morphological features. |
-| **RETURNS**  | bool      | Whether the token's part-of-speech tag and morphological features describe a base form. |
-
-## Attributes {#attributes}
-
-| Name                                   | Type                      | Description |
-| -------------------------------------- | ------------------------- | ----------- |
-| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the rules and data, if available. |
+---
+title: Lemmatizer
+tag: class
+source: spacy/pipeline/lemmatizer.py
+new: 3
+teaser: 'Pipeline component for lemmatization'
+api_base_class: /api/pipe
+api_string_name: lemmatizer
+api_trainable: false
+---
+
+## Config and implementation
+
+The default config is defined by the pipeline component factory and describes
+how the component should be configured. You can override its settings via the
+`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
+[`config.cfg` for training](/usage/training#config).
+
+For examples of the lookups data formats used by the lookup and rule-based
+lemmatizers, see the
+[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo.
+
+> #### Example
+>
+> ```python
+> config = {"mode": "rule"}
+> nlp.add_pipe("lemmatizer", config=config)
+> ```
+
+| Setting     | Type                                       | Description | Default    |
+| ----------- | ------------------------------------------ | ----------- | ---------- |
+| `mode`      | str                                        | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. | `"lookup"` |
+| `lookups`   | [`Lookups`](/api/lookups)                  | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from `spacy-lookups-data`. | `None` |
+| `overwrite` | bool                                       | Whether to overwrite existing lemmas. | `False` |
+| `model`     | [`Model`](https://thinc.ai/docs/api-model) | **Not yet implemented:** the model to use. | `None` |
+
+```python
+https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/lemmatizer.py
+```
+
+## Lemmatizer.\_\_init\_\_ {#init tag="method"}
+
+> #### Example
+>
+> ```python
+> # Construction via add_pipe with default model
+> lemmatizer = nlp.add_pipe("lemmatizer")
+>
+> # Construction via add_pipe with custom settings
+> config = {"mode": "rule", "overwrite": True}
+> lemmatizer = nlp.add_pipe("lemmatizer", config=config)
+> ```
+
+Create a new pipeline instance. In your application, you would normally use a
+shortcut for this and instantiate the component using its string name and
+[`nlp.add_pipe`](/api/language#add_pipe).
+
+| Name           | Type                                       | Description |
+| -------------- | ------------------------------------------ | ----------- |
+| `vocab`        | [`Vocab`](/api/vocab)                      | The vocab. |
+| `model`        | [`Model`](https://thinc.ai/docs/api-model) | A model (not yet implemented). |
+| `name`         | str                                        | String name of the component instance. Used to add entries to the `losses` during training. |
+| _keyword-only_ |                                            |             |
+| `mode`         | str                                        | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. |
+| `lookups`      | [`Lookups`](/api/lookups)                  | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. |
+| `overwrite`    | bool                                       | Whether to overwrite existing lemmas. |
+
+## Lemmatizer.\_\_call\_\_ {#call tag="method"}
+
+Apply the pipe to one document. The document is modified in place, and
+returned. This usually happens under the hood when the `nlp` object is called
+on a text and all pipeline components are applied to the `Doc` in order.
+
+> #### Example
+>
+> ```python
+> doc = nlp("This is a sentence.")
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> # This usually happens under the hood
+> processed = lemmatizer(doc)
+> ```
+
+| Name        | Type  | Description              |
+| ----------- | ----- | ------------------------ |
+| `doc`       | `Doc` | The document to process. |
+| **RETURNS** | `Doc` | The processed document.  |
+
+## Lemmatizer.pipe {#pipe tag="method"}
+
+Apply the pipe to a stream of documents. This usually happens under the hood
+when the `nlp` object is called on a text and all pipeline components are
+applied to the `Doc` in order.
+
+> #### Example
+>
+> ```python
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> for doc in lemmatizer.pipe(docs, batch_size=50):
+>     pass
+> ```
+
+| Name           | Type            | Description                                             |
+| -------------- | --------------- | ------------------------------------------------------- |
+| `stream`       | `Iterable[Doc]` | A stream of documents.                                  |
+| _keyword-only_ |                 |                                                         |
+| `batch_size`   | int             | The number of texts to buffer. Defaults to `128`.       |
+| **YIELDS**     | `Doc`           | Processed documents in the order of the original text.  |
+
+## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"}
+
+Lemmatize a token using a lookup-based approach. If no lemma is found, the
+original string is returned. Languages can provide a
+[lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`.
+
+| Name        | Type                  | Description                           |
+| ----------- | --------------------- | ------------------------------------- |
+| `token`     | [`Token`](/api/token) | The token to lemmatize.               |
+| **RETURNS** | `List[str]`           | A list containing one or more lemmas. |
+
+## Lemmatizer.rule_lemmatize {#rule_lemmatize tag="method"}
+
+Lemmatize a token using a rule-based approach. Typically relies on POS tags.
+
+| Name        | Type                  | Description                           |
+| ----------- | --------------------- | ------------------------------------- |
+| `token`     | [`Token`](/api/token) | The token to lemmatize.               |
+| **RETURNS** | `List[str]`           | A list containing one or more lemmas. |
+
+## Lemmatizer.is_base_form {#is_base_form tag="method"}
+
+Check whether we're dealing with an uninflected paradigm, so we can avoid
+lemmatization entirely.
+
+| Name        | Type                  | Description |
+| ----------- | --------------------- | ----------- |
+| `token`     | [`Token`](/api/token) | The token to analyze. |
+| **RETURNS** | bool                  | Whether the token's attributes (e.g., part-of-speech tag, morphological features) describe a base form. |
+
+## Lemmatizer.get_lookups_config {#get_lookups_config tag="classmethod"}
+
+Returns the lookups configuration settings for a given mode for use in
+[`Lemmatizer.load_lookups`](#load_lookups).
+
+| Name        | Type | Description                                       |
+| ----------- | ---- | ------------------------------------------------- |
+| `mode`      | str  | The lemmatizer mode.                              |
+| **RETURNS** | dict | The lookups configuration settings for this mode. |
+
+## Lemmatizer.load_lookups {#load_lookups tag="classmethod"}
+
+Load and validate lookups tables. If the provided lookups is `None`, load the
+default lookups tables according to the language and mode settings. Confirm
+that all required tables for the language and mode are present.
+
+| Name        | Type                      | Description |
+| ----------- | ------------------------- | ----------- |
+| `lang`      | str                       | The language. |
+| `mode`      | str                       | The lemmatizer mode. |
+| `lookups`   | [`Lookups`](/api/lookups) | The provided lookups, may be `None` if the default lookups should be loaded. |
+| **RETURNS** | [`Lookups`](/api/lookups) | The lookups object. |
+
+## Lemmatizer.to_disk {#to_disk tag="method"}
+
+Serialize the pipe to disk.
+
+> #### Example
+>
+> ```python
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> lemmatizer.to_disk("/path/to/lemmatizer")
+> ```
+
+| Name           | Type            | Description |
+| -------------- | --------------- | ----------- |
+| `path`         | str / `Path`    | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| _keyword-only_ |                 |             |
+| `exclude`      | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+
+## Lemmatizer.from_disk {#from_disk tag="method"}
+
+Load the pipe from disk. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> lemmatizer.from_disk("/path/to/lemmatizer")
+> ```
+
+| Name           | Type            | Description |
+| -------------- | --------------- | ----------- |
+| `path`         | str / `Path`    | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| _keyword-only_ |                 |             |
+| `exclude`      | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS**    | `Lemmatizer`    | The modified `Lemmatizer` object. |
+
+## Lemmatizer.to_bytes {#to_bytes tag="method"}
+
+> #### Example
+>
+> ```python
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> lemmatizer_bytes = lemmatizer.to_bytes()
+> ```
+
+Serialize the pipe to a bytestring.
+
+| Name           | Type            | Description |
+| -------------- | --------------- | ----------- |
+| _keyword-only_ |                 |             |
+| `exclude`      | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS**    | bytes           | The serialized form of the `Lemmatizer` object. |
+
+## Lemmatizer.from_bytes {#from_bytes tag="method"}
+
+Load the pipe from a bytestring. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> lemmatizer_bytes = lemmatizer.to_bytes()
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> lemmatizer.from_bytes(lemmatizer_bytes)
+> ```
+
+| Name           | Type            | Description |
+| -------------- | --------------- | ----------- |
+| `bytes_data`   | bytes           | The data to load from. |
+| _keyword-only_ |                 |             |
+| `exclude`      | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS**    | `Lemmatizer`    | The `Lemmatizer` object. |
+
+## Lemmatizer.mode {#mode tag="property"}
+
+The lemmatizer mode.
+
+| Name        | Type  | Description          |
+| ----------- | ----- | -------------------- |
+| **RETURNS** | `str` | The lemmatizer mode. |
+
+## Attributes {#attributes}
+
+| Name      | Type                      | Description         |
+| --------- | ------------------------- | ------------------- |
+| `vocab`   | [`Vocab`](/api/vocab)     | The shared vocab.   |
+| `lookups` | [`Lookups`](/api/lookups) | The lookups object. |
+
+## Serialization fields {#serialization-fields}
+
+During serialization, spaCy will export several data fields used to restore
+different aspects of the object. If needed, you can exclude them from
+serialization by passing in the string names via the `exclude` argument.
+
+> #### Example
+>
+> ```python
+> data = lemmatizer.to_disk("/path", exclude=["vocab"])
+> ```
+
+| Name      | Description                                          |
+| --------- | ---------------------------------------------------- |
+| `vocab`   | The shared [`Vocab`](/api/vocab).                    |
+| `lookups` | The lookups. You usually don't want to exclude this. |
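
To tie the two classmethods above together, a small sketch of a plausible call sequence. The returned config shape and the availability of default tables are assumptions: they depend on the implementation for each mode and on `spacy-lookups-data` being installed:

```python
# Sketch (assumed behavior): resolve and validate lookups for a mode.
from spacy.pipeline import Lemmatizer  # import path assumed

# e.g. which tables are required vs. optional for "rule" mode
config = Lemmatizer.get_lookups_config("rule")
# Passing lookups=None loads the defaults for the language, if available.
lookups = Lemmatizer.load_lookups("en", "rule", None)
assert "lemma_rules" in lookups
```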

View File

@ -11,22 +11,19 @@ this class.

 ## Morphology.\_\_init\_\_ {#init tag="method"}

-Create a Morphology object using the tag map, lemmatizer and exceptions.
+Create a Morphology object.

 > #### Example
 >
 > ```python
 > from spacy.morphology import Morphology
 >
-> morphology = Morphology(strings, tag_map, lemmatizer)
+> morphology = Morphology(strings)
 > ```

 | Name | Type | Description |
-| ------------ | ----------------- | ----------------- |
+| --------- | ------------- | ----------------- |
 | `strings` | `StringStore` | The string store. |
-| `tag_map` | `Dict[str, Dict]` | The tag map. |
-| `lemmatizer` | `Lemmatizer` | The lemmatizer. |
-| `exc` | `Dict[str, Dict]` | A dictionary of exceptions in the format `{tag: {orth: {"POS": "X", "Feat1": "Val1, "Feat2": "Val2", ...}` |

 ## Morphology.add {#add tag="method"}

@ -62,52 +59,6 @@ Get the FEATS string for the hash of the morphological analysis.

 | ------- | ---- | --------------------------------------- |
 | `morph` | int  | The hash of the morphological analysis. |

-## Morphology.load_tag_map {#load_tag_map tag="method"}
-
-Replace the current tag map with the provided tag map.
-
-| Name      | Type              | Description  |
-| --------- | ----------------- | ------------ |
-| `tag_map` | `Dict[str, Dict]` | The tag map. |
-
-## Morphology.load_morph_exceptions {#load_morph_exceptions tag="method"}
-
-Replace the current morphological exceptions with the provided exceptions.
-
-| Name          | Type              | Description                   |
-| ------------- | ----------------- | ----------------------------- |
-| `morph_rules` | `Dict[str, Dict]` | The morphological exceptions. |
-
-## Morphology.add_special_case {#add_special_case tag="method"}
-
-Add a special-case rule to the morphological analyzer. Tokens whose tag and
-orth match the rule will receive the specified properties.
-
-> #### Example
->
-> ```python
-> attrs = {"POS": "DET", "Definite": "Def"}
-> morphology.add_special_case("DT", "the", attrs)
-> ```
-
-| Name       | Type | Description                                    |
-| ---------- | ---- | ---------------------------------------------- |
-| `tag_str`  | str  | The fine-grained tag.                          |
-| `orth_str` | str  | The token text.                                |
-| `attrs`    | dict | The features to assign for this token and tag. |
-
-## Morphology.exc {#exc tag="property"}
-
-The current morphological exceptions.
-
-| Name       | Type | Description                                          |
-| ---------- | ---- | ---------------------------------------------------- |
-| **YIELDS** | dict | The current dictionary of morphological exceptions.  |
-
-## Morphology.lemmatize {#lemmatize tag="method"}
-
-TODO
-
 ## Morphology.feats_to_dict {#feats_to_dict tag="staticmethod"}

 Convert a string FEATS representation to a dictionary of features and values in

View File

@ -47,7 +47,7 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx
 >
 > # Construction via create_pipe with custom model
 > config = {"model": {"@architectures": "my_tagger"}}
-> parser = nlp.add_pipe("tagger", config=config)
+> tagger = nlp.add_pipe("tagger", config=config)
 >
 > # Construction from class
 > from spacy.pipeline import Tagger

@ -285,16 +285,14 @@ Add a new label to the pipe.

 > #### Example
 >
 > ```python
-> from spacy.symbols import POS
 > tagger = nlp.add_pipe("tagger")
-> tagger.add_label("MY_LABEL", {POS: "NOUN"})
+> tagger.add_label("MY_LABEL")
 > ```

-| Name        | Type             | Description                                                      |
-| ----------- | ---------------- | ---------------------------------------------------------------- |
-| `label`     | str              | The label to add.                                                 |
-| `values`    | `Dict[int, str]` | Optional values to map to the label, e.g. a tag map dictionary.   |
-| **RETURNS** | int              | `0` if the label is already present, otherwise `1`.               |
+| Name        | Type | Description                                          |
+| ----------- | ---- | ---------------------------------------------------- |
+| `label`     | str  | The label to add.                                    |
+| **RETURNS** | int  | `0` if the label is already present, otherwise `1`.  |

 ## Tagger.to_disk {#to_disk tag="method"}

@ -369,9 +367,7 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.

 ## Tagger.labels {#labels tag="property"}

-The labels currently added to the component. Note that even for a blank
-component, this will always include the built-in coarse-grained part-of-speech
-tags by default, e.g. `VERB`, `NOUN` and so on.
+The labels currently added to the component.

 > #### Example
 >

@ -396,9 +392,8 @@ serialization by passing in the string names via the `exclude` argument.

 > data = tagger.to_disk("/path", exclude=["vocab"])
 > ```

 | Name      | Description                                                     |
 | --------- | --------------------------------------------------------------- |
 | `vocab`   | The shared [`Vocab`](/api/vocab).                                |
 | `cfg`     | The config file. You usually don't want to exclude this.        |
 | `model`   | The binary model data. You usually don't want to exclude this.  |
-| `tag_map` | The [tag map](/usage/adding-languages#tag-map) mapping fine-grained to coarse-grained tag. |
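
With tag-map values gone from `add_label`, bootstrapping a blank tagger reduces to adding string labels before training. A minimal sketch of the documented usage (tag names illustrative):

```python
# Sketch: the simplified Tagger only needs string labels.
from spacy.lang.en import English

nlp = English()
tagger = nlp.add_pipe("tagger")
for tag in ("N", "V", "J"):
    tagger.add_label(tag)
nlp.begin_training()
assert set(tagger.labels) == {"N", "V", "J"}
```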

View File

@ -24,8 +24,6 @@ Create the vocabulary.

 | Name | Type | Description |
 | ---- | ---- | ----------- |
 | `lex_attr_getters` | dict | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. |
-| `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. |
-| `lemmatizer` | object | A lemmatizer. Defaults to `None`. |
 | `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. |
 | `lookups` | `Lookups` | A [`Lookups`](/api/lookups) that stores the `lemma_\*`, `lexeme_norm` and other large lookup tables. Defaults to `None`. |
 | `lookups_extra` <Tag variant="new">2.3</Tag> | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. |