Make lemmatizers use initialize logic (#6182)

* Make lemmatizer use initialize logic and tidy up

* Fix typo

* Raise for uninitialized tables
Ines Montani 2020-10-02 15:42:36 +02:00 committed by GitHub
parent df06f7a792
commit f0b30aedad
16 changed files with 236 additions and 281 deletions
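
The user-facing effect of the change: the `lookups` setting disappears from the lemmatizer's config, and lookup tables are loaded when the pipeline is initialized rather than when the component is constructed. A minimal sketch of the new flow (assuming the v3 nightly API shown in this diff and an installed `spacy-lookups-data` package):

```python
import spacy

# The component starts out empty; tables are only loaded by nlp.initialize().
nlp = spacy.blank("en")
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
assert not lemmatizer.lookups.tables   # no tables yet
nlp.initialize()                       # loads the required/optional tables for the mode
print([token.lemma_ for token in nlp("coping")])
```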

spacy/errors.py

@@ -477,6 +477,8 @@ class Errors:
     E201 = ("Span index out of range.")
     # TODO: fix numbering after merging develop into master
+    E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
+            "for mode '{mode}'. Required tables: {tables}. Found: {found}.")
     E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
             "config.cfg or override it on the CLI?")
     E914 = ("Executing {name} callback failed. Expected the function to "

@@ -556,10 +558,10 @@ class Errors:
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
             "component.")
-    E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
-            "spacy-lookups-data. If you want to initialize a blank nlp object, "
-            "make sure you have the spacy-lookups-data package installed or "
-            "remove the [initialize.lookups] block from your config.")
+    E955 = ("Can't find table(s) {table} for language '{lang}' in "
+            "spacy-lookups-data. Make sure you have the package installed or "
+            "provide your own lookup tables if no default lookups are available "
+            "for your language.")
     E956 = ("Can't find component '{name}' in [components] block in the config. "
             "Available components: {opts}")
     E957 = ("Writing directly to Language.factories isn't needed anymore in "

@@ -685,9 +687,8 @@ class Errors:
     E1002 = ("Span index out of range.")
     E1003 = ("Unsupported lemmatizer mode '{mode}'.")
     E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
-             "Required tables '{tables}', found '{found}'. If you are not "
-             "providing custom lookups, make sure you have the package "
-             "spacy-lookups-data installed.")
+             "Required tables: {tables}. Found: {found}. Maybe you forgot to "
+             "call nlp.initialize() to load in the data?")
     E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
              "'{chunk}'. Tokenizer exceptions are only allowed to specify "
              "`ORTH` and `NORM`.")
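
The split between the two messages mirrors where validation can fail: E1004 is raised at runtime when the pipe runs uninitialized, while E912 is the default used when tables are re-validated during deserialization (see `_validate_tables` in the lemmatizer diff below). A hedged sketch of the runtime path:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
try:
    nlp("lemmatize me")  # __call__ validates the tables on first use
except ValueError as err:
    print(err)  # E1004: ... "Maybe you forgot to call nlp.initialize() ..."
```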

spacy/lang/bn/__init__.py

@@ -4,7 +4,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer

@@ -24,18 +23,11 @@ class Bengali(Language):
 @Bengali.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Bengali"]

spacy/lang/el/__init__.py

@@ -7,7 +7,6 @@ from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .lemmatizer import GreekLemmatizer
-from ...lookups import Lookups
 from ...language import Language

@@ -29,18 +28,11 @@ class Greek(Language):
 @Greek.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return GreekLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Greek"]

spacy/lang/en/__init__.py

@@ -8,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES
 from .lemmatizer import EnglishLemmatizer
 from ...language import Language
-from ...lookups import Lookups


 class EnglishDefaults(Language.Defaults):

@@ -27,18 +26,11 @@ class English(Language):
 @English.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["English"]

spacy/lang/fa/__init__.py

@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_SUFFIXES
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer

@@ -27,18 +26,11 @@ class Persian(Language):
 @Persian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Persian"]

spacy/lang/fr/__init__.py

@@ -9,7 +9,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .lemmatizer import FrenchLemmatizer
-from ...lookups import Lookups
 from ...language import Language

@@ -32,18 +31,11 @@ class French(Language):
 @French.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["French"]

spacy/lang/nb/__init__.py

@@ -6,7 +6,6 @@ from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer

@@ -27,18 +26,11 @@ class Norwegian(Language):
 @Norwegian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Norwegian"]

spacy/lang/nl/__init__.py

@@ -1,5 +1,4 @@
 from typing import Optional
 from thinc.api import Model
 from .stop_words import STOP_WORDS

@@ -8,7 +7,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
-from ...lookups import Lookups
 from ...language import Language

@@ -29,18 +27,11 @@ class Dutch(Language):
 @Dutch.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return DutchLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Dutch"]

spacy/lang/pl/__init__.py

@@ -34,18 +34,11 @@ class Polish(Language):
 @Polish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pos_lookup", "lookups": None},
+    default_config={"model": None, "mode": "pos_lookup"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return PolishLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Polish"]

spacy/lang/ru/__init__.py

@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
-from ...lookups import Lookups


 class RussianDefaults(Language.Defaults):

@@ -23,17 +22,11 @@ class Russian(Language):
 @Russian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return RussianLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Russian"]

spacy/lang/sv/__init__.py

@@ -5,7 +5,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer

@@ -30,18 +29,11 @@ class Swedish(Language):
 @Swedish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Swedish"]

spacy/lang/uk/__init__.py

@@ -7,7 +7,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import UkrainianLemmatizer
 from ...language import Language
-from ...lookups import Lookups


 class UkrainianDefaults(Language.Defaults):

@@ -24,17 +23,11 @@ class Ukrainian(Language):
 @Ukrainian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Ukrainian"]

spacy/pipeline/lemmatizer.py

@@ -1,26 +1,25 @@
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Callable, Iterable, Iterator, Union
+from typing import Tuple
 from thinc.api import Model
+from pathlib import Path

 from .pipe import Pipe
 from ..errors import Errors
 from ..language import Language
+from ..training import Example
 from ..lookups import Lookups, load_lookups
 from ..scorer import Scorer
 from ..tokens import Doc, Token
 from ..vocab import Vocab
 from ..training import validate_examples
+from ..util import logger, SimpleFrozenList
 from .. import util


 @Language.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={
-        "model": None,
-        "mode": "lookup",
-        "lookups": None,
-        "overwrite": False,
-    },
+    default_config={"model": None, "mode": "lookup", "overwrite": False},
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
@@ -28,13 +27,9 @@ def make_lemmatizer(
     model: Optional[Model],
     name: str,
     mode: str,
-    lookups: Optional[Lookups],
     overwrite: bool = False,
 ):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(
-        nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
-    )
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)


 class Lemmatizer(Pipe):
@@ -46,59 +41,19 @@ class Lemmatizer(Pipe):
     """

     @classmethod
-    def get_lookups_config(cls, mode: str) -> Dict:
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
         """Returns the lookups configuration settings for a given mode for use
         in Lemmatizer.load_lookups.

         mode (str): The lemmatizer mode.
-        RETURNS (dict): The lookups configuration settings for this mode.
+        RETURNS (Tuple[List[str], List[str]]): The required and optional
+            lookup tables for this mode.
+
+        DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
         """
         if mode == "lookup":
-            return {
-                "required_tables": ["lemma_lookup"],
-            }
+            return (["lemma_lookup"], [])
         elif mode == "rule":
-            return {
-                "required_tables": ["lemma_rules"],
-                "optional_tables": ["lemma_exc", "lemma_index"],
-            }
-        return {}
+            return (["lemma_rules"], ["lemma_exc", "lemma_index"])
+        return ([], [])

-    @classmethod
-    def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups]) -> Lookups:
-        """Load and validate lookups tables. If the provided lookups is None,
-        load the default lookups tables according to the language and mode
-        settings. Confirm that all required tables for the language and mode
-        are present.
-
-        lang (str): The language code.
-        mode (str): The lemmatizer mode.
-        lookups (Lookups): The provided lookups, may be None if the default
-            lookups should be loaded.
-        RETURNS (Lookups): The Lookups object.
-
-        DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
-        """
-        config = cls.get_lookups_config(mode)
-        required_tables = config.get("required_tables", [])
-        optional_tables = config.get("optional_tables", [])
-        if lookups is None:
-            lookups = load_lookups(lang=lang, tables=required_tables)
-            optional_lookups = load_lookups(
-                lang=lang, tables=optional_tables, strict=False
-            )
-            for table in optional_lookups.tables:
-                lookups.set_table(table, optional_lookups.get_table(table))
-        for table in required_tables:
-            if table not in lookups:
-                raise ValueError(
-                    Errors.E1004.format(
-                        mode=mode, tables=required_tables, found=lookups.tables
-                    )
-                )
-        return lookups

     def __init__(
         self,
@@ -107,7 +62,6 @@ class Lemmatizer(Pipe):
         name: str = "lemmatizer",
         *,
         mode: str = "lookup",
-        lookups: Optional[Lookups] = None,
         overwrite: bool = False,
     ) -> None:
         """Initialize a Lemmatizer.

@@ -116,9 +70,6 @@ class Lemmatizer(Pipe):
         model (Model): A model (not yet implemented).
         name (str): The component name. Defaults to "lemmatizer".
         mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
-        lookups (Lookups): The lookups object containing the (optional) tables
-            such as "lemma_rules", "lemma_index", "lemma_exc" and
-            "lemma_lookup". Defaults to None
         overwrite (bool): Whether to overwrite existing lemmas. Defaults to
             `False`.
@@ -128,8 +79,9 @@ class Lemmatizer(Pipe):
         self.model = model
         self.name = name
         self._mode = mode
-        self.lookups = lookups if lookups is not None else Lookups()
+        self.lookups = Lookups()
         self.overwrite = overwrite
+        self._validated = False
         if self.mode == "lookup":
             self.lemmatize = self.lookup_lemmatize
         elif self.mode == "rule":
@@ -153,12 +105,56 @@ class Lemmatizer(Pipe):
         DOCS: https://nightly.spacy.io/api/lemmatizer#call
         """
+        if not self._validated:
+            self._validate_tables(Errors.E1004)
         for token in doc:
             if self.overwrite or token.lemma == 0:
                 token.lemma_ = self.lemmatize(token)[0]
         return doc

-    def pipe(self, stream, *, batch_size=128):
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        nlp: Optional[Language] = None,
+        lookups: Optional[Lookups] = None,
+    ):
+        """Initialize the lemmatizer and load in data.
+
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
+        nlp (Language): The current nlp object the component is part of.
+        lookups (Lookups): The lookups object containing the (optional) tables
+            such as "lemma_rules", "lemma_index", "lemma_exc" and
+            "lemma_lookup". Defaults to None.
+        """
+        required_tables, optional_tables = self.get_lookups_config(self.mode)
+        if lookups is None:
+            logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
+            lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
+            optional_lookups = load_lookups(
+                lang=self.vocab.lang, tables=optional_tables, strict=False
+            )
+            for table in optional_lookups.tables:
+                lookups.set_table(table, optional_lookups.get_table(table))
+        self.lookups = lookups
+        self._validate_tables(Errors.E1004)
+
+    def _validate_tables(self, error_message: str = Errors.E912) -> None:
+        """Check that the lookups are correct for the current mode."""
+        required_tables, optional_tables = self.get_lookups_config(self.mode)
+        for table in required_tables:
+            if table not in self.lookups:
+                raise ValueError(
+                    error_message.format(
+                        mode=self.mode,
+                        tables=required_tables,
+                        found=self.lookups.tables,
+                    )
+                )
+        self._validated = True
+
+    def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are
         applied to the Doc.
@@ -263,7 +259,7 @@ class Lemmatizer(Pipe):
         """
         return False

-    def score(self, examples, **kwargs) -> Dict[str, Any]:
+    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         """Score a batch of examples.

         examples (Iterable[Example]): The examples to score.
@@ -274,58 +270,66 @@ class Lemmatizer(Pipe):
         validate_examples(examples, "Lemmatizer.score")
         return Scorer.score_token_attr(examples, "lemma", **kwargs)

-    def to_disk(self, path, *, exclude=tuple()):
-        """Save the current state to a directory.
+    def to_disk(
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+    ):
+        """Serialize the pipe to disk.

-        path (unicode or Path): A path to a directory, which will be created if
-            it doesn't exist.
-        exclude (list): String names of serialization fields to exclude.
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.

-        DOCS: https://nightly.spacy.io/api/vocab#to_disk
+        DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
         """
         serialize = {}
         serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["lookups"] = lambda p: self.lookups.to_disk(p)
         util.to_disk(path, serialize, exclude)

-    def from_disk(self, path, *, exclude=tuple()):
-        """Loads state from a directory. Modifies the object in place and
-        returns it.
+    def from_disk(
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> "Lemmatizer":
+        """Load the pipe from disk. Modifies the object in place and returns it.

-        path (unicode or Path): A path to a directory.
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (Vocab): The modified `Vocab` object.
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (Lemmatizer): The modified Lemmatizer object.

-        DOCS: https://nightly.spacy.io/api/vocab#to_disk
+        DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
         """
         deserialize = {}
         deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
         deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
         util.from_disk(path, deserialize, exclude)
+        self._validate_tables()
+        return self

-    def to_bytes(self, *, exclude=tuple()) -> bytes:
-        """Serialize the current state to a binary string.
+    def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
+        """Serialize the pipe to a bytestring.

-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized form of the `Vocab` object.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (bytes): The serialized object.

-        DOCS: https://nightly.spacy.io/api/vocab#to_bytes
+        DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
         """
         serialize = {}
         serialize["vocab"] = self.vocab.to_bytes
         serialize["lookups"] = self.lookups.to_bytes
         return util.to_bytes(serialize, exclude)

-    def from_bytes(self, bytes_data: bytes, *, exclude=tuple()):
-        """Load state from a binary string.
+    def from_bytes(
+        self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> "Lemmatizer":
+        """Load the pipe from a bytestring.

-        bytes_data (bytes): The data to load from.
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (Vocab): The `Vocab` object.
+        bytes_data (bytes): The serialized pipe.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (Lemmatizer): The loaded Lemmatizer.

-        DOCS: https://nightly.spacy.io/api/vocab#from_bytes
+        DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
         """
         deserialize = {}
         deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
         deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
         util.from_bytes(bytes_data, deserialize, exclude)
+        self._validate_tables()
+        return self
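
Note the new serialization behavior above: `from_disk` and `from_bytes` call `_validate_tables()` (with the default E912 message) after loading, so a deserialized pipe is usable without a separate initialization step. A sketch of the round trip:

```python
from spacy.lang.en import English
from spacy.lookups import Lookups

nlp = English()
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
lookups = Lookups()
lookups.add_table("lemma_lookup", {"coping": "cope"})
lemmatizer.initialize(lookups=lookups)

# The tables travel with the serialized pipe and are re-validated on load.
nlp2 = English()
lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "lookup"})
lemmatizer2.from_bytes(lemmatizer.to_bytes())
assert "lemma_lookup" in lemmatizer2.lookups
```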

spacy/tests/lang/test_lemmatizers.py

@@ -17,16 +17,31 @@ def test_lemmatizer_initialize(lang, capfd):
     @registry.misc("lemmatizer_init_lookups")
     def lemmatizer_init_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "x": "y"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups

-    """Test that languages can be initialized."""
+    # Test that languages can be initialized
     nlp = get_lang_class(lang)()
-    nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}})
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
+    assert not lemmatizer.lookups.tables
+    nlp.config["initialize"]["components"]["lemmatizer"] = {
+        "lookups": {"@misc": "lemmatizer_init_lookups"}
+    }
+    with pytest.raises(ValueError):
+        nlp("x")
+    nlp.initialize()
+    assert lemmatizer.lookups.tables
+    doc = nlp("x")
     # Check for stray print statements (see #3342)
-    doc = nlp("test")  # noqa: F841
     captured = capfd.readouterr()
     assert not captured.out
+    assert doc[0].lemma_ == "y"
+
+    # Test initialization by calling .initialize() directly
+    nlp = get_lang_class(lang)()
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
+    lemmatizer.initialize(lookups=lemmatizer_init_lookups())
+    assert nlp("x")[0].lemma_ == "y"

spacy/tests/pipeline/test_lemmatizer.py

@@ -8,61 +8,52 @@ from ..util import make_tempdir

 @pytest.fixture
 def nlp():
-    return English()
-
-
-@pytest.fixture
-def lemmatizer(nlp):
     @registry.misc("cope_lookups")
     def cope_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups

-    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
-    )
-    return lemmatizer
+    nlp = English()
+    nlp.config["initialize"]["components"]["lemmatizer"] = {
+        "lookups": {"@misc": "cope_lookups"}
+    }
+    return nlp


 def test_lemmatizer_init(nlp):
-    @registry.misc("cope_lookups")
-    def cope_lookups():
-        lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
-        lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
-        lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
-        lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
-        return lookups
-
-    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}}
-    )
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
     assert isinstance(lemmatizer.lookups, Lookups)
+    assert not lemmatizer.lookups.tables
     assert lemmatizer.mode == "lookup"
+    with pytest.raises(ValueError):
+        nlp("test")
+    nlp.initialize()
+    assert lemmatizer.lookups.tables
+    assert nlp("cope")[0].lemma_ == "cope"
+    assert nlp("coped")[0].lemma_ == "cope"
     # replace any tables from spacy-lookups-data
     lemmatizer.lookups = Lookups()
-    doc = nlp("coping")
     # lookup with no tables sets text as lemma
-    assert doc[0].lemma_ == "coping"
+    assert nlp("cope")[0].lemma_ == "cope"
+    assert nlp("coped")[0].lemma_ == "coped"
     nlp.remove_pipe("lemmatizer")
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
-
-    @registry.misc("empty_lookups")
-    def empty_lookups():
-        return Lookups()
-
     with pytest.raises(ValueError):
-        nlp.add_pipe(
-            "lemmatizer",
-            config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}},
-        )
+        # Can't initialize without required tables
+        lemmatizer.initialize(lookups=Lookups())
+    lookups = Lookups()
+    lookups.add_table("lemma_lookup", {})
+    lemmatizer.initialize(lookups=lookups)


-def test_lemmatizer_config(nlp, lemmatizer):
+def test_lemmatizer_config(nlp):
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+    nlp.initialize()
     doc = nlp.make_doc("coping")
     doc[0].pos_ = "VERB"
     assert doc[0].lemma_ == ""

@@ -78,20 +69,21 @@ def test_lemmatizer_config(nlp, lemmatizer):
     assert doc[0].lemma_ == "cope"


-def test_lemmatizer_serialize(nlp, lemmatizer):
-    @registry.misc("cope_lookups")
+def test_lemmatizer_serialize(nlp):
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+    nlp.initialize()
+
     def cope_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups

     nlp2 = English()
-    lemmatizer2 = nlp2.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
-    )
+    lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "rule"})
+    lemmatizer2.initialize(lookups=cope_lookups())
     lemmatizer2.from_bytes(lemmatizer.to_bytes())
     assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
     assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables

@@ -100,9 +92,9 @@ def test_lemmatizer_serialize(nlp, lemmatizer):
     with make_tempdir() as tmp_dir:
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2.make_doc("coping")
         doc2[0].pos_ = "VERB"
         assert doc2[0].lemma_ == ""
         doc2 = lemmatizer(doc2)
         assert doc2[0].text == "coping"
         assert doc2[0].lemma_ == "cope"

website/docs/api/lemmatizer.md

@@ -48,12 +48,11 @@ data format used by the lookup and rule-based lemmatizers, see
 > nlp.add_pipe("lemmatizer", config=config)
 > ```

 | Setting     | Description                                                                       |
 | ----------- | --------------------------------------------------------------------------------- |
 | `mode`      | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~  |
-| `lookups`   | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
 | `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~                |
 | `model`     | **Not yet implemented:** the model to use. ~~Model~~                               |

 ```python
 %%GITHUB_SPACY/spacy/pipeline/lemmatizer.py
 ```

@@ -76,15 +75,14 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).

 | Name           | Description                                                                                          |
 | -------------- | ---------------------------------------------------------------------------------------------------- |
 | `vocab`        | The shared vocabulary. ~~Vocab~~                                                                     |
 | `model`        | **Not yet implemented:** The model to use. ~~Model~~                                                 |
 | `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
 | _keyword-only_ |                                                                                                      |
 | `mode`         | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~                    |
-| `lookups`      | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. ~~Optional[Lookups]~~ |
 | `overwrite`    | Whether to overwrite existing lemmas. ~~bool~~                                                       |

 ## Lemmatizer.\_\_call\_\_ {#call tag="method"}

@@ -127,6 +125,37 @@ applied to the `Doc` in order.
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~                     |

+## Lemmatizer.initialize {#initialize tag="method"}
+
+Initialize the lemmatizer and load any data resources. This method is typically
+called by [`Language.initialize`](/api/language#initialize) and lets you
+customize arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config. The loading only happens during initialization, typically before
+training. At runtime, all data is loaded from disk.
+
+> #### Example
+>
+> ```python
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> lemmatizer.initialize(lookups=lookups)
+> ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.lemmatizer]
+>
+> [initialize.components.lemmatizer.lookups]
+> @misc = "load_my_lookups.v1"
+> ```
+
+| Name           | Description |
+| -------------- | ----------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Defaults to `None`. ~~Optional[Callable[[], Iterable[Example]]]~~ |
+| _keyword-only_ |             |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `lookups`      | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
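
The `@misc = "load_my_lookups.v1"` reference in the config example above points at a registered function; its Python side might look like the following sketch (the registered name and table contents are placeholders):

```python
from spacy.lookups import Lookups
from spacy.util import registry

@registry.misc("load_my_lookups.v1")
def load_my_lookups() -> Lookups:
    lookups = Lookups()
    # "lemma_lookup" is required for "lookup" mode; "lemma_rules" for "rule" mode.
    lookups.add_table("lemma_lookup", {"coping": "cope"})
    return lookups
```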
## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"}

Lemmatize a token using a lookup-based approach. If no lemma is found, the