Make lemmatizers use initialize logic (#6182)
* Make lemmatizer use initialize logic and tidy up
* Fix typo
* Raise for uninitialized tables

parent: df06f7a792
commit: f0b30aedad
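The net effect of the change: the lemmatizer no longer receives its lookups tables through the factory's `lookups` config setting. The component is constructed empty, and the tables are loaded when the pipeline is initialized. A minimal sketch of the new flow (not part of the diff; the table data is illustrative):

```python
import spacy
from spacy.lookups import Lookups

nlp = spacy.blank("en")
# The factory no longer accepts a "lookups" setting; construction is cheap.
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
assert not lemmatizer.lookups.tables

# Tables are loaded during initialization instead.
lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog"})  # illustrative data
lemmatizer.initialize(lookups=lookups)
assert nlp("dogs")[0].lemma_ == "dog"
```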
spacy/errors.py

@@ -477,6 +477,8 @@ class Errors:
     E201 = ("Span index out of range.")

     # TODO: fix numbering after merging develop into master
+    E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
+            "for mode '{mode}'. Required tables: {tables}. Found: {found}.")
     E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
             "config.cfg or override it on the CLI?")
     E914 = ("Executing {name} callback failed. Expected the function to "

@@ -556,10 +558,10 @@ class Errors:
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
             "component.")
-    E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
-            "spacy-lookups-data. If you want to initialize a blank nlp object, "
-            "make sure you have the spacy-lookups-data package installed or "
-            "remove the [initialize.lookups] block from your config.")
+    E955 = ("Can't find table(s) {table} for language '{lang}' in "
+            "spacy-lookups-data. Make sure you have the package installed or "
+            "provide your own lookup tables if no default lookups are available "
+            "for your language.")
     E956 = ("Can't find component '{name}' in [components] block in the config. "
             "Available components: {opts}")
     E957 = ("Writing directly to Language.factories isn't needed anymore in "

@@ -685,9 +687,8 @@ class Errors:
     E1002 = ("Span index out of range.")
     E1003 = ("Unsupported lemmatizer mode '{mode}'.")
     E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
-             "Required tables '{tables}', found '{found}'. If you are not "
-             "providing custom lookups, make sure you have the package "
-             "spacy-lookups-data installed.")
+             "Required tables: {tables}. Found: {found}. Maybe you forgot to "
+             "call nlp.initialize() to load in the data?")
     E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
             "'{chunk}'. Tokenizer exceptions are only allowed to specify "
             "`ORTH` and `NORM`.")
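The reworded E1004 points at the new failure mode: a lemmatizer whose tables were never loaded now raises as soon as it is called. A sketch of how this surfaces (illustrative, not part of the diff; the `initialize()` call assumes `spacy-lookups-data` is installed):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
try:
    nlp("This fails")  # the required tables were never loaded
except ValueError as err:
    print(err)  # E1004 ... Maybe you forgot to call nlp.initialize() ...
nlp.initialize()  # loads the required tables, here from spacy-lookups-data
doc = nlp("This works")
```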
spacy/lang/bn/__init__.py

@@ -4,7 +4,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer


@@ -24,18 +23,11 @@ class Bengali(Language):
 @Bengali.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Bengali"]
spacy/lang/el/__init__.py

@@ -7,7 +7,6 @@ from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .lemmatizer import GreekLemmatizer
-from ...lookups import Lookups
 from ...language import Language


@@ -29,18 +28,11 @@ class Greek(Language):
 @Greek.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return GreekLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Greek"]
spacy/lang/en/__init__.py

@@ -8,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES
 from .lemmatizer import EnglishLemmatizer
 from ...language import Language
-from ...lookups import Lookups


 class EnglishDefaults(Language.Defaults):

@@ -27,18 +26,11 @@ class English(Language):
 @English.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["English"]
spacy/lang/fa/__init__.py

@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_SUFFIXES
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer


@@ -27,18 +26,11 @@ class Persian(Language):
 @Persian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Persian"]
spacy/lang/fr/__init__.py

@@ -9,7 +9,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .lemmatizer import FrenchLemmatizer
-from ...lookups import Lookups
 from ...language import Language


@@ -32,18 +31,11 @@ class French(Language):
 @French.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["French"]
spacy/lang/nb/__init__.py

@@ -6,7 +6,6 @@ from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer


@@ -27,18 +26,11 @@ class Norwegian(Language):
 @Norwegian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Norwegian"]
spacy/lang/nl/__init__.py

@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model

 from .stop_words import STOP_WORDS

@@ -8,7 +7,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
-from ...lookups import Lookups
 from ...language import Language


@@ -29,18 +27,11 @@ class Dutch(Language):
 @Dutch.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return DutchLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Dutch"]
spacy/lang/pl/__init__.py

@@ -34,18 +34,11 @@ class Polish(Language):
 @Polish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pos_lookup", "lookups": None},
+    default_config={"model": None, "mode": "pos_lookup"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return PolishLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Polish"]
spacy/lang/ru/__init__.py

@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
-from ...lookups import Lookups


 class RussianDefaults(Language.Defaults):

@@ -23,17 +22,11 @@ class Russian(Language):
 @Russian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return RussianLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Russian"]
spacy/lang/sv/__init__.py

@@ -5,7 +5,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer


@@ -30,18 +29,11 @@ class Swedish(Language):
 @Swedish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Swedish"]
spacy/lang/uk/__init__.py

@@ -7,7 +7,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import UkrainianLemmatizer
 from ...language import Language
-from ...lookups import Lookups


 class UkrainianDefaults(Language.Defaults):

@@ -24,17 +23,11 @@ class Ukrainian(Language):
 @Ukrainian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Ukrainian"]
spacy/pipeline/lemmatizer.py

@@ -1,26 +1,25 @@
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Callable, Iterable, Iterator, Union
+from typing import Tuple
 from thinc.api import Model
+from pathlib import Path

 from .pipe import Pipe
 from ..errors import Errors
 from ..language import Language
+from ..training import Example
 from ..lookups import Lookups, load_lookups
 from ..scorer import Scorer
 from ..tokens import Doc, Token
 from ..vocab import Vocab
 from ..training import validate_examples
+from ..util import logger, SimpleFrozenList
 from .. import util


 @Language.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={
-        "model": None,
-        "mode": "lookup",
-        "lookups": None,
-        "overwrite": False,
-    },
+    default_config={"model": None, "mode": "lookup", "overwrite": False},
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(

@@ -28,13 +27,9 @@ def make_lemmatizer(
     model: Optional[Model],
     name: str,
     mode: str,
-    lookups: Optional[Lookups],
     overwrite: bool = False,
 ):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(
-        nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
-    )
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)


 class Lemmatizer(Pipe):

@@ -46,59 +41,19 @@ class Lemmatizer(Pipe):
     """

     @classmethod
-    def get_lookups_config(cls, mode: str) -> Dict:
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
         """Returns the lookups configuration settings for a given mode for use
         in Lemmatizer.load_lookups.

         mode (str): The lemmatizer mode.
-        RETURNS (dict): The lookups configuration settings for this mode.
-
-        DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
+        RETURNS (Tuple[List[str], List[str]]): The required and optional
+            lookup tables for this mode.
         """
         if mode == "lookup":
-            return {
-                "required_tables": ["lemma_lookup"],
-            }
+            return (["lemma_lookup"], [])
         elif mode == "rule":
-            return {
-                "required_tables": ["lemma_rules"],
-                "optional_tables": ["lemma_exc", "lemma_index"],
-            }
-        return {}
-
-    @classmethod
-    def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups]) -> Lookups:
-        """Load and validate lookups tables. If the provided lookups is None,
-        load the default lookups tables according to the language and mode
-        settings. Confirm that all required tables for the language and mode
-        are present.
-
-        lang (str): The language code.
-        mode (str): The lemmatizer mode.
-        lookups (Lookups): The provided lookups, may be None if the default
-            lookups should be loaded.
-        RETURNS (Lookups): The Lookups object.
-
-        DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
-        """
-        config = cls.get_lookups_config(mode)
-        required_tables = config.get("required_tables", [])
-        optional_tables = config.get("optional_tables", [])
-        if lookups is None:
-            lookups = load_lookups(lang=lang, tables=required_tables)
-            optional_lookups = load_lookups(
-                lang=lang, tables=optional_tables, strict=False
-            )
-            for table in optional_lookups.tables:
-                lookups.set_table(table, optional_lookups.get_table(table))
-        for table in required_tables:
-            if table not in lookups:
-                raise ValueError(
-                    Errors.E1004.format(
-                        mode=mode, tables=required_tables, found=lookups.tables
-                    )
-                )
-        return lookups
+            return (["lemma_rules"], ["lemma_exc", "lemma_index"])
+        return ([], [])

     def __init__(
         self,

@@ -107,7 +62,6 @@ class Lemmatizer(Pipe):
         name: str = "lemmatizer",
         *,
         mode: str = "lookup",
-        lookups: Optional[Lookups] = None,
         overwrite: bool = False,
     ) -> None:
         """Initialize a Lemmatizer.

@@ -116,9 +70,6 @@ class Lemmatizer(Pipe):
         model (Model): A model (not yet implemented).
         name (str): The component name. Defaults to "lemmatizer".
         mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
-        lookups (Lookups): The lookups object containing the (optional) tables
-            such as "lemma_rules", "lemma_index", "lemma_exc" and
-            "lemma_lookup". Defaults to None
         overwrite (bool): Whether to overwrite existing lemmas. Defaults to
             `False`.

@@ -128,8 +79,9 @@ class Lemmatizer(Pipe):
         self.model = model
         self.name = name
         self._mode = mode
-        self.lookups = lookups if lookups is not None else Lookups()
+        self.lookups = Lookups()
         self.overwrite = overwrite
+        self._validated = False
         if self.mode == "lookup":
             self.lemmatize = self.lookup_lemmatize
         elif self.mode == "rule":

@@ -153,12 +105,56 @@ class Lemmatizer(Pipe):

         DOCS: https://nightly.spacy.io/api/lemmatizer#call
         """
+        if not self._validated:
+            self._validate_tables(Errors.E1004)
         for token in doc:
             if self.overwrite or token.lemma == 0:
                 token.lemma_ = self.lemmatize(token)[0]
         return doc

-    def pipe(self, stream, *, batch_size=128):
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        nlp: Optional[Language] = None,
+        lookups: Optional[Lookups] = None,
+    ):
+        """Initialize the lemmatizer and load in data.
+
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
+        nlp (Language): The current nlp object the component is part of.
+        lookups (Lookups): The lookups object containing the (optional) tables
+            such as "lemma_rules", "lemma_index", "lemma_exc" and
+            "lemma_lookup". Defaults to None.
+        """
+        required_tables, optional_tables = self.get_lookups_config(self.mode)
+        if lookups is None:
+            logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
+            lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
+            optional_lookups = load_lookups(
+                lang=self.vocab.lang, tables=optional_tables, strict=False
+            )
+            for table in optional_lookups.tables:
+                lookups.set_table(table, optional_lookups.get_table(table))
+        self.lookups = lookups
+        self._validate_tables(Errors.E1004)
+
+    def _validate_tables(self, error_message: str = Errors.E912) -> None:
+        """Check that the lookups are correct for the current mode."""
+        required_tables, optional_tables = self.get_lookups_config(self.mode)
+        for table in required_tables:
+            if table not in self.lookups:
+                raise ValueError(
+                    error_message.format(
+                        mode=self.mode,
+                        tables=required_tables,
+                        found=self.lookups.tables,
+                    )
+                )
+        self._validated = True
+
+    def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are
         applied to the Doc.
@@ -263,7 +259,7 @@ class Lemmatizer(Pipe):
         """
         return False

-    def score(self, examples, **kwargs) -> Dict[str, Any]:
+    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         """Score a batch of examples.

         examples (Iterable[Example]): The examples to score.

@@ -274,58 +270,66 @@ class Lemmatizer(Pipe):
         validate_examples(examples, "Lemmatizer.score")
         return Scorer.score_token_attr(examples, "lemma", **kwargs)

-    def to_disk(self, path, *, exclude=tuple()):
-        """Save the current state to a directory.
+    def to_disk(
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+    ):
+        """Serialize the pipe to disk.

-        path (unicode or Path): A path to a directory, which will be created if
-            it doesn't exist.
-        exclude (list): String names of serialization fields to exclude.
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.

-        DOCS: https://nightly.spacy.io/api/vocab#to_disk
+        DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
         """
         serialize = {}
         serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["lookups"] = lambda p: self.lookups.to_disk(p)
         util.to_disk(path, serialize, exclude)

-    def from_disk(self, path, *, exclude=tuple()):
-        """Loads state from a directory. Modifies the object in place and
-        returns it.
+    def from_disk(
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> "Lemmatizer":
+        """Load the pipe from disk. Modifies the object in place and returns it.

-        path (unicode or Path): A path to a directory.
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (Vocab): The modified `Vocab` object.
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (Lemmatizer): The modified Lemmatizer object.

-        DOCS: https://nightly.spacy.io/api/vocab#to_disk
+        DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
         """
         deserialize = {}
         deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
         deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
         util.from_disk(path, deserialize, exclude)
+        self._validate_tables()
         return self

-    def to_bytes(self, *, exclude=tuple()) -> bytes:
-        """Serialize the current state to a binary string.
+    def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
+        """Serialize the pipe to a bytestring.

-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized form of the `Vocab` object.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (bytes): The serialized object.

-        DOCS: https://nightly.spacy.io/api/vocab#to_bytes
+        DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
         """
         serialize = {}
         serialize["vocab"] = self.vocab.to_bytes
         serialize["lookups"] = self.lookups.to_bytes
         return util.to_bytes(serialize, exclude)

-    def from_bytes(self, bytes_data: bytes, *, exclude=tuple()):
-        """Load state from a binary string.
+    def from_bytes(
+        self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> "Lemmatizer":
+        """Load the pipe from a bytestring.

-        bytes_data (bytes): The data to load from.
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (Vocab): The `Vocab` object.
+        bytes_data (bytes): The serialized pipe.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (Lemmatizer): The loaded Lemmatizer.

-        DOCS: https://nightly.spacy.io/api/vocab#from_bytes
+        DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
         """
         deserialize = {}
         deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
         deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
         util.from_bytes(bytes_data, deserialize, exclude)
+        self._validate_tables()
         return self
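Taken together, `initialize` plus `_validate_tables` give the component a two-phase lifecycle: construct empty, then load and validate. A sketch of the ways tables can now arrive (illustrative, not part of the diff; the table data is made up):

```python
from spacy.lang.en import English
from spacy.lookups import Lookups

nlp = English()
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})

# 1. Explicitly, by passing a Lookups object to the component:
lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog"})  # illustrative data
lemmatizer.initialize(lookups=lookups)

# 2. Implicitly, via nlp.initialize(), which consults the
#    [initialize.components.lemmatizer] block of the config.
# 3. From disk: from_disk()/from_bytes() re-run _validate_tables(),
#    so a deserialized component is immediately usable.
```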
spacy/tests/lang/test_lemmatizers.py

@@ -17,16 +17,31 @@ def test_lemmatizer_initialize(lang, capfd):
     @registry.misc("lemmatizer_init_lookups")
     def lemmatizer_init_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "x": "y"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups

-    """Test that languages can be initialized."""
+    # Test that languages can be initialized
     nlp = get_lang_class(lang)()
-    nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}})
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
+    assert not lemmatizer.lookups.tables
+    nlp.config["initialize"]["components"]["lemmatizer"] = {
+        "lookups": {"@misc": "lemmatizer_init_lookups"}
+    }
+    with pytest.raises(ValueError):
+        nlp("x")
+    nlp.initialize()
+    assert lemmatizer.lookups.tables
+    doc = nlp("x")
     # Check for stray print statements (see #3342)
-    doc = nlp("test")  # noqa: F841
     captured = capfd.readouterr()
     assert not captured.out
+    assert doc[0].lemma_ == "y"
+
+    # Test initialization by calling .initialize() directly
+    nlp = get_lang_class(lang)()
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
+    lemmatizer.initialize(lookups=lemmatizer_init_lookups())
+    assert nlp("x")[0].lemma_ == "y"
spacy/tests/pipeline/test_lemmatizer.py

@@ -8,61 +8,52 @@ from ..util import make_tempdir

 @pytest.fixture
 def nlp():
-    return English()
-
-
-@pytest.fixture
-def lemmatizer(nlp):
     @registry.misc("cope_lookups")
     def cope_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups

-    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
-    )
-    return lemmatizer
+    nlp = English()
+    nlp.config["initialize"]["components"]["lemmatizer"] = {
+        "lookups": {"@misc": "cope_lookups"}
+    }
+    return nlp


 def test_lemmatizer_init(nlp):
-    @registry.misc("cope_lookups")
-    def cope_lookups():
-        lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
-        lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
-        lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
-        lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
-        return lookups
-
-    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}}
-    )
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
     assert isinstance(lemmatizer.lookups, Lookups)
+    assert not lemmatizer.lookups.tables
     assert lemmatizer.mode == "lookup"
+    with pytest.raises(ValueError):
+        nlp("test")
+    nlp.initialize()
+    assert lemmatizer.lookups.tables
+    assert nlp("cope")[0].lemma_ == "cope"
+    assert nlp("coped")[0].lemma_ == "cope"
     # replace any tables from spacy-lookups-data
     lemmatizer.lookups = Lookups()
-    doc = nlp("coping")
     # lookup with no tables sets text as lemma
-    assert doc[0].lemma_ == "coping"
-
     assert nlp("cope")[0].lemma_ == "cope"
     assert nlp("coped")[0].lemma_ == "coped"
+    nlp.remove_pipe("lemmatizer")

-    @registry.misc("empty_lookups")
-    def empty_lookups():
-        return Lookups()
-
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
     with pytest.raises(ValueError):
-        nlp.add_pipe(
-            "lemmatizer",
-            config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}},
-        )
+        # Can't initialize without required tables
+        lemmatizer.initialize(lookups=Lookups())
+    lookups = Lookups()
+    lookups.add_table("lemma_lookup", {})
+    lemmatizer.initialize(lookups=lookups)


-def test_lemmatizer_config(nlp, lemmatizer):
+def test_lemmatizer_config(nlp):
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+    nlp.initialize()
+
     doc = nlp.make_doc("coping")
     doc[0].pos_ = "VERB"
     assert doc[0].lemma_ == ""

@@ -78,20 +69,21 @@ def test_lemmatizer_config(nlp, lemmatizer):
     assert doc[0].lemma_ == "cope"


-def test_lemmatizer_serialize(nlp, lemmatizer):
-    @registry.misc("cope_lookups")
+def test_lemmatizer_serialize(nlp):
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+    nlp.initialize()
+
     def cope_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups

     nlp2 = English()
-    lemmatizer2 = nlp2.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
-    )
+    lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "rule"})
+    lemmatizer2.initialize(lookups=cope_lookups())
     lemmatizer2.from_bytes(lemmatizer.to_bytes())
     assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
     assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables
website/docs/api/lemmatizer.md

@@ -49,9 +49,8 @@ data format used by the lookup and rule-based lemmatizers, see
 > ```

 | Setting     | Description |
-| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------- | --------------------------------------------------------------------------------- |
 | `mode`      | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
-| `lookups`   | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
 | `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ |
 | `model`     | **Not yet implemented:** the model to use. ~~Model~~ |

@@ -77,13 +76,12 @@ shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).

 | Name           | Description |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | --------------------------------------------------------------------------------------------------- |
 | `vocab`        | The shared vocabulary. ~~Vocab~~ |
 | `model`        | **Not yet implemented:** The model to use. ~~Model~~ |
 | `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
 | _keyword-only_ |  |
 | `mode`         | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
-| `lookups`      | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. ~~Optional[Lookups]~~ |
 | `overwrite`    | Whether to overwrite existing lemmas. ~~bool~~ |

 ## Lemmatizer.\_\_call\_\_ {#call tag="method"}

@@ -127,6 +125,37 @@ applied to the `Doc` in order.
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~ |

+## Lemmatizer.initialize {#initialize tag="method"}
+
+Initialize the lemmatizer and load any data resources. This method is typically
+called by [`Language.initialize`](/api/language#initialize) and lets you
+customize arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config. The loading only happens during initialization, typically before
+training. At runtime, all data is loaded from disk.
+
+> #### Example
+>
+> ```python
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> lemmatizer.initialize(lookups=lookups)
+> ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.lemmatizer]
+>
+> [initialize.components.lemmatizer.lookups]
+> @misc = "load_my_lookups.v1"
+> ```
+
+| Name           | Description |
+| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Defaults to `None`. ~~Optional[Callable[[], Iterable[Example]]]~~ |
+| _keyword-only_ |  |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `lookups`      | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |

 ## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"}

 Lemmatize a token using a lookup-based approach. If no lemma is found, the
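The `load_my_lookups.v1` function referenced in the config example above is user-supplied. A sketch of what registering it might look like (hypothetical name and data, following the pattern used in the tests):

```python
from spacy.util import registry
from spacy.lookups import Lookups

@registry.misc("load_my_lookups.v1")
def load_my_lookups() -> Lookups:
    lookups = Lookups()
    lookups.add_table("lemma_lookup", {"dogs": "dog"})  # illustrative data
    return lookups
```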