Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)
	Refactor pipeline components, config and language data (#5759)
* Update with WIP
* Update with WIP
* Update with pipeline serialization
* Update types and pipe factories
* Add deep merge, tidy up and add tests
* Fix pipe creation from config
* Don't validate default configs on load
* Update spacy/language.py (Co-authored-by: Ines Montani <ines@ines.io>)
* Adjust factory/component meta error
* Clean up factory args and remove defaults
* Add test for failing empty dict defaults
* Update pipeline handling and methods
* provide KB as registry function instead of as object
* small change in test to make functionality more clear
* update example script for EL configuration
* Fix typo
* Simplify test
* Simplify test
* splitting pipes.pyx into separate files
* moving default configs to each component file
* fix batch_size type
* removing default values from component constructors where possible (TODO: test 4725)
* skip instead of xfail
* Add test for config -> nlp with multiple instances
* pipeline.pipes -> pipeline.pipe
* Tidy up, document, remove kwargs
* small cleanup/generalization for Tok2VecListener
* use DEFAULT_UPSTREAM field
* revert to avoid circular imports
* Fix tests
* Replace deprecated arg
* Make model dirs require config
* fix pickling of keyword-only arguments in constructor
* WIP: clean up and integrate full config
* Add helper to handle function args more reliably (now also includes keyword-only args)
* Fix config composition and serialization
* Improve config debugging and add visual diff
* Remove unused defaults and fix type
* Remove pipeline and factories from meta
* Update spacy/default_config.cfg (Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>)
* Update spacy/default_config.cfg
* small UX edits
* avoid printing stack trace for debug CLI commands
* Add support for language-specific factories
* specify the section of the config which holds the model to debug
* WIP: add Language.from_config
* Update with language data refactor WIP
* Auto-format
* Add backwards-compat handling for Language.factories
* Update morphologizer.pyx
* Fix morphologizer
* Update and simplify lemmatizers
* Fix Japanese tests
* Port over tagger changes
* Fix Chinese and tests
* Update to latest Thinc
* WIP: xfail first Russian lemmatizer test
* Fix component-specific overrides
* fix nO for output layers in debug_model
* Fix default value
* Fix tests and don't pass objects in config
* Fix deep merging
* Fix lemma lookup data registry (only load the lookups if an entry is available in the registry and if spacy-lookups-data is installed)
* Add types
* Add Vocab.from_config
* Fix typo
* Fix tests
* Make config copying more elegant
* Fix pipe analysis
* Fix lemmatizers and is_base_form
* WIP: move language defaults to config
* Fix morphology type
* Fix vocab
* Remove comment
* Update to latest Thinc
* Add morph rules to config
* Tidy up
* Remove set_morphology option from tagger factory
* Hack use_gpu
* Move [pipeline] to top-level block and make [nlp.pipeline] a list (allows separating component blocks from component order; otherwise, ordering the config would mean a changed component order, which is bad; also allows the initial config to define more components and not use all of them)
* Fix use_gpu and resume in CLI
* Auto-format
* Remove resume from config
* Fix formatting and error
* [pipeline] -> [components]
* Fix types
* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
parent 311d0bde29
commit 43b960c01b
@@ -17,7 +17,6 @@ import plac
import random
from pathlib import Path
import spacy
from spacy.kb import KnowledgeBase

from spacy.gold import Example
from spacy.pipeline import EntityRuler
@@ -82,12 +81,16 @@ def main(kb_path, vocab_path, output_dir=None, n_iter=50):

    # Create the Entity Linker component and add it to the pipeline.
    if "entity_linker" not in nlp.pipe_names:
        kb = KnowledgeBase(vocab=nlp.vocab)
        kb.load_bulk(kb_path)
        print("Loaded Knowledge Base from '%s'" % kb_path)

        # use only the predicted EL score and not the prior probability (for demo purposes)
        cfg = {"kb": kb, "incl_prior": False}
        print("Loading Knowledge Base from '%s'" % kb_path)
        cfg = {
            "kb": {
                "@assets": "spacy.KBFromFile.v1",
                "vocab_path": vocab_path,
                "kb_path": kb_path,
            },
            # use only the predicted EL score and not the prior probability (for demo purposes)
            "incl_prior": False,
        }
        entity_linker = nlp.create_pipe("entity_linker", cfg)
        nlp.add_pipe(entity_linker, last=True)
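The new example config above refers to a registered function ("@assets": "spacy.KBFromFile.v1") instead of passing a KnowledgeBase object directly. The following sketch is a simplified, self-contained illustration of that pattern, using a plain dictionary registry and a hypothetical load_kb_from_file helper; it is not spaCy's actual registry API, only a stand-in to show how a named config block can be resolved into an object when the pipe is created.

# Simplified stand-in for config-driven object creation (not spaCy's API).
# A registry maps string names to factory functions; a block containing an
# "@assets" key is resolved by looking up the named function and calling it
# with the remaining keys as keyword arguments.
ASSETS = {}

def register_asset(name):
    def wrapper(func):
        ASSETS[name] = func
        return func
    return wrapper

@register_asset("spacy.KBFromFile.v1")
def load_kb_from_file(vocab_path, kb_path):
    # Hypothetical loader: the real registered function would build a
    # KnowledgeBase from the serialized vocab and KB files.
    return {"vocab_path": vocab_path, "kb_path": kb_path}

def resolve(value):
    if isinstance(value, dict) and "@assets" in value:
        kwargs = {k: v for k, v in value.items() if k != "@assets"}
        return ASSETS[value["@assets"]](**kwargs)
    return value

cfg = {
    "kb": {"@assets": "spacy.KBFromFile.v1", "vocab_path": "vocab", "kb_path": "kb"},
    "incl_prior": False,
}
resolved = {key: resolve(value) for key, value in cfg.items()}
print(resolved["kb"], resolved["incl_prior"])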
@@ -6,7 +6,7 @@ requires = [
    "cymem>=2.0.2,<2.1.0",
    "preshed>=3.0.2,<3.1.0",
    "murmurhash>=0.28.0,<1.1.0",
    "thinc>=8.0.0a18,<8.0.0a20",
    "thinc>=8.0.0a19,<8.0.0a30",
    "blis>=0.4.0,<0.5.0",
    "pytokenizations"
]
@@ -1,11 +1,11 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a18,<8.0.0a20
thinc>=8.0.0a19,<8.0.0a30
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0
wasabi>=0.7.0,<1.1.0
wasabi>=0.7.1,<1.1.0
srsly>=2.1.0,<3.0.0
catalogue>=0.0.7,<1.1.0
typer>=0.3.0,<0.4.0
@@ -34,15 +34,15 @@ setup_requires =
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    murmurhash>=0.28.0,<1.1.0
    thinc>=8.0.0a18,<8.0.0a20
    thinc>=8.0.0a19,<8.0.0a30
install_requires =
    # Our libraries
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    thinc>=8.0.0a18,<8.0.0a20
    thinc>=8.0.0a19,<8.0.0a30
    blis>=0.4.0,<0.5.0
    wasabi>=0.7.0,<1.1.0
    wasabi>=0.7.1,<1.1.0
    srsly>=2.1.0,<3.0.0
    catalogue>=0.0.7,<1.1.0
    typer>=0.3.0,<0.4.0
setup.py (8 changed lines)
@@ -32,8 +32,14 @@ MOD_NAMES = [
    "spacy.attrs",
    "spacy.kb",
    "spacy.morphology",
    "spacy.pipeline.pipes",
    "spacy.pipeline.dep_parser",
    "spacy.pipeline.morphologizer",
    "spacy.pipeline.multitask",
    "spacy.pipeline.ner",
    "spacy.pipeline.pipe",
    "spacy.pipeline.sentencizer",
    "spacy.pipeline.senter",
    "spacy.pipeline.tagger",
    "spacy.syntax.stateclass",
    "spacy.syntax._state",
    "spacy.tokenizer",
@@ -14,7 +14,6 @@ from .about import __version__
from .errors import Errors, Warnings
from . import util
from .util import registry
from .language import component


if sys.maxunicode == 65535:
@@ -63,7 +63,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
    result = {}
    while args:
        opt = args.pop(0)
        err = f"Invalid config override '{opt}'"
        err = f"Invalid CLI argument '{opt}'"
        if opt.startswith("--"):  # new argument
            opt = opt.replace("--", "").replace("-", "_")
            if "." not in opt:
@@ -73,7 +73,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
            else:
                value = args.pop(0)
            # Just like we do in the config, we're calling json.loads on the
            # values. But since they come from the CLI, it'd b unintuitive to
            # values. But since they come from the CLI, it'd be unintuitive to
            # explicitly mark strings with escaped quotes. So we're working
            # around that here by falling back to a string if parsing fails.
            # TODO: improve logic to handle simple types like list of strings?
@@ -82,7 +82,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
            except ValueError:
                result[opt] = str(value)
        else:
            msg.fail(f"{err}: options need to start with --", exits=1)
            msg.fail(f"{err}: override option should start with --", exits=1)
    return result
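parse_config_overrides turns trailing CLI arguments of the form --section.key value into a flat overrides dict, JSON-parsing each value and falling back to a plain string, as the comments above describe. A minimal standalone sketch of that behaviour, independent of spaCy's CLI helpers and with a simplified error path:

import json
from typing import Any, Dict, List

def parse_overrides(args: List[str]) -> Dict[str, Any]:
    # Walk "--training.batch_size 128"-style pairs and build a flat dict.
    result: Dict[str, Any] = {}
    while args:
        opt = args.pop(0)
        if not opt.startswith("--") or "." not in opt:
            raise ValueError(f"Invalid CLI argument '{opt}'")
        key = opt.replace("--", "").replace("-", "_")
        value = args.pop(0)
        try:
            # Values are JSON-parsed, just like values in the config file ...
            result[key] = json.loads(value)
        except ValueError:
            # ... but unquoted strings fall back to plain str.
            result[key] = str(value)
    return result

print(parse_overrides(["--training.batch_size", "128", "--nlp.lang", "en"]))
# {'training.batch_size': 128, 'nlp.lang': 'en'}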
@@ -3,12 +3,12 @@ from pathlib import Path
from collections import Counter
import sys
import srsly
from wasabi import Printer, MESSAGES, msg
from wasabi import Printer, MESSAGES, msg, diff_strings
import typer
from thinc.api import Config

from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code, debug_cli
from ..schemas import ConfigSchema
from ..gold import Corpus, Example
from ..syntax import nonproj
from ..language import Language
@@ -33,6 +33,9 @@ def debug_config_cli(
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    output_path: Optional[Path] = Opt(None, "--output", "-o", help="Output path for filled config or '-' for standard output", allow_dash=True),
    auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"),
    diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled")
    # fmt: on
):
    """Debug a config.cfg file and show validation errors. The command will
@@ -40,14 +43,37 @@ def debug_config_cli(
    validation errors are blocking and will prevent the rest of the config from
    being resolved. This means that you may not see all validation errors at
    once and some issues are only shown once previous errors have been fixed.
    Similar as with the 'train' command, you can override settings from the config
    as command line options. For instance, --training.batch_size 128 overrides
    the value of "batch_size" in the block "[training]".
    """
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    with show_validation_error():
        util.load_config(
            config_path, create_objects=False, schema=ConfigSchema, overrides=overrides,
        )
    msg.good("Config is valid")
        config = Config().from_disk(config_path)
        try:
            nlp, _ = util.load_model_from_config(
                config, overrides=overrides, auto_fill=auto_fill
            )
        except ValueError as e:
            msg.fail(str(e), exits=1)
    is_stdout = output_path is not None and str(output_path) == "-"
    if auto_fill:
        orig_config = config.to_str()
        filled_config = nlp.config.to_str()
        if orig_config == filled_config:
            msg.good("Original config is valid, no values were auto-filled")
        else:
            msg.good("Auto-filled config is valid")
            if diff:
                print(diff_strings(config.to_str(), nlp.config.to_str()))
    else:
        msg.good("Original config is valid", show=not is_stdout)
    if is_stdout:
        print(nlp.config.to_str())
    elif output_path is not None:
        nlp.config.to_disk(output_path)
        msg.good(f"Saved updated config to {output_path}")


@debug_cli.command(
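The auto-fill branch above compares the original config with nlp.config and prints a visual diff via wasabi's diff_strings, which is why the wasabi pin is bumped to 0.7.1 elsewhere in this commit. A small usage sketch with two hypothetical config strings standing in for config.to_str() and nlp.config.to_str():

from wasabi import diff_strings

# Two small config snippets standing in for the original and auto-filled
# config.cfg contents.
original = "[training]\ndropout = 0.1\n"
filled = "[training]\ndropout = 0.1\npatience = 1600\n"

if original == filled:
    print("No values were auto-filled")
else:
    # Prints a line-by-line diff of the two strings.
    print(diff_strings(original, filled))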
			@ -117,16 +143,13 @@ def debug_data(
 | 
			
		|||
    if not config_path.exists():
 | 
			
		||||
        msg.fail("Config file not found", config_path, exists=1)
 | 
			
		||||
    with show_validation_error():
 | 
			
		||||
        config = util.load_config(
 | 
			
		||||
            config_path,
 | 
			
		||||
            create_objects=False,
 | 
			
		||||
            schema=ConfigSchema,
 | 
			
		||||
            overrides=config_overrides,
 | 
			
		||||
        )
 | 
			
		||||
    nlp = util.load_model_from_config(config["nlp"])
 | 
			
		||||
        cfg = Config().from_disk(config_path)
 | 
			
		||||
        nlp, config = util.load_model_from_config(cfg, overrides=config_overrides)
 | 
			
		||||
    # TODO: handle base model
 | 
			
		||||
    lang = config["nlp"]["lang"]
 | 
			
		||||
    base_model = config["nlp"]["base_model"]
 | 
			
		||||
    pipeline = list(config["nlp"]["pipeline"].keys())
 | 
			
		||||
    base_model = config["training"]["base_model"]
 | 
			
		||||
    pipeline = nlp.pipe_names
 | 
			
		||||
    factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
 | 
			
		||||
    tag_map_path = util.ensure_path(config["training"]["tag_map"])
 | 
			
		||||
    tag_map = {}
 | 
			
		||||
    if tag_map_path is not None:
 | 
			
		||||
| 
						 | 
				
			
			@ -164,19 +187,17 @@ def debug_data(
 | 
			
		|||
    msg.good("Corpus is loadable")
 | 
			
		||||
 | 
			
		||||
    # Create all gold data here to avoid iterating over the train_dataset constantly
 | 
			
		||||
    gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True)
 | 
			
		||||
    gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
 | 
			
		||||
    gold_train_unpreprocessed_data = _compile_gold(
 | 
			
		||||
        train_dataset, pipeline, nlp, make_proj=False
 | 
			
		||||
        train_dataset, factory_names, nlp, make_proj=False
 | 
			
		||||
    )
 | 
			
		||||
    gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True)
 | 
			
		||||
    gold_dev_data = _compile_gold(dev_dataset, factory_names, nlp, make_proj=True)
 | 
			
		||||
 | 
			
		||||
    train_texts = gold_train_data["texts"]
 | 
			
		||||
    dev_texts = gold_dev_data["texts"]
 | 
			
		||||
 | 
			
		||||
    msg.divider("Training stats")
 | 
			
		||||
    msg.text(f"Training pipeline: {', '.join(pipeline)}")
 | 
			
		||||
    for pipe in [p for p in pipeline if p not in nlp.factories]:
 | 
			
		||||
        msg.fail(f"Pipeline component '{pipe}' not available in factories")
 | 
			
		||||
    if base_model:
 | 
			
		||||
        msg.text(f"Starting with base model '{base_model}'")
 | 
			
		||||
    else:
 | 
			
		||||
| 
						 | 
				
			
			@ -244,7 +265,7 @@ def debug_data(
 | 
			
		|||
    else:
 | 
			
		||||
        msg.info("No word vectors present in the model")
 | 
			
		||||
 | 
			
		||||
    if "ner" in pipeline:
 | 
			
		||||
    if "ner" in factory_names:
 | 
			
		||||
        # Get all unique NER labels present in the data
 | 
			
		||||
        labels = set(
 | 
			
		||||
            label for label in gold_train_data["ner"] if label not in ("O", "-", None)
 | 
			
		||||
| 
						 | 
				
			
			@ -332,7 +353,7 @@ def debug_data(
 | 
			
		|||
                "with punctuation can not be trained with a noise level > 0."
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
    if "textcat" in pipeline:
 | 
			
		||||
    if "textcat" in factory_names:
 | 
			
		||||
        msg.divider("Text Classification")
 | 
			
		||||
        labels = [label for label in gold_train_data["cats"]]
 | 
			
		||||
        model_labels = _get_labels_from_model(nlp, "textcat")
 | 
			
		||||
| 
						 | 
				
			
			@ -379,7 +400,7 @@ def debug_data(
 | 
			
		|||
                    "contains only instances with mutually-exclusive classes."
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
    if "tagger" in pipeline:
 | 
			
		||||
    if "tagger" in factory_names:
 | 
			
		||||
        msg.divider("Part-of-speech Tagging")
 | 
			
		||||
        labels = [label for label in gold_train_data["tags"]]
 | 
			
		||||
        tag_map = nlp.vocab.morphology.tag_map
 | 
			
		||||
| 
						 | 
				
			
			@ -394,7 +415,7 @@ def debug_data(
 | 
			
		|||
        for label in non_tagmap:
 | 
			
		||||
            msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")
 | 
			
		||||
 | 
			
		||||
    if "parser" in pipeline:
 | 
			
		||||
    if "parser" in factory_names:
 | 
			
		||||
        has_low_data_warning = False
 | 
			
		||||
        msg.divider("Dependency Parsing")
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -541,7 +562,10 @@ def _load_file(file_path: Path, msg: Printer) -> None:
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
def _compile_gold(
 | 
			
		||||
    examples: Sequence[Example], pipeline: List[str], nlp: Language, make_proj: bool
 | 
			
		||||
    examples: Sequence[Example],
 | 
			
		||||
    factory_names: List[str],
 | 
			
		||||
    nlp: Language,
 | 
			
		||||
    make_proj: bool,
 | 
			
		||||
) -> Dict[str, Any]:
 | 
			
		||||
    data = {
 | 
			
		||||
        "ner": Counter(),
 | 
			
		||||
| 
						 | 
				
			
			@ -573,7 +597,7 @@ def _compile_gold(
 | 
			
		|||
            for word in valid_words:
 | 
			
		||||
                if nlp.vocab.strings[word] not in nlp.vocab.vectors:
 | 
			
		||||
                    data["words_missing_vectors"].update([word])
 | 
			
		||||
        if "ner" in pipeline:
 | 
			
		||||
        if "ner" in factory_names:
 | 
			
		||||
            for i, label in enumerate(eg.get_aligned_ner()):
 | 
			
		||||
                if label is None:
 | 
			
		||||
                    continue
 | 
			
		||||
| 
						 | 
				
			
			@ -595,14 +619,14 @@ def _compile_gold(
 | 
			
		|||
                    data["ner"][combined_label] += 1
 | 
			
		||||
                elif label == "-":
 | 
			
		||||
                    data["ner"]["-"] += 1
 | 
			
		||||
        if "textcat" in pipeline:
 | 
			
		||||
        if "textcat" in factory_names:
 | 
			
		||||
            data["cats"].update(gold.cats)
 | 
			
		||||
            if list(gold.cats.values()).count(1.0) != 1:
 | 
			
		||||
                data["n_cats_multilabel"] += 1
 | 
			
		||||
        if "tagger" in pipeline:
 | 
			
		||||
        if "tagger" in factory_names:
 | 
			
		||||
            tags = eg.get_aligned("TAG", as_string=True)
 | 
			
		||||
            data["tags"].update([x for x in tags if x is not None])
 | 
			
		||||
        if "parser" in pipeline:
 | 
			
		||||
        if "parser" in factory_names:
 | 
			
		||||
            aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=make_proj)
 | 
			
		||||
            data["deps"].update([x for x in aligned_deps if x is not None])
 | 
			
		||||
            for i, (dep, head) in enumerate(zip(aligned_deps, aligned_heads)):
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,8 +1,11 @@
 | 
			
		|||
from typing import Dict, Any, Optional
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from wasabi import msg
 | 
			
		||||
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
 | 
			
		||||
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam, Config
 | 
			
		||||
from thinc.api import Model
 | 
			
		||||
import typer
 | 
			
		||||
 | 
			
		||||
from ._util import Arg, Opt, debug_cli
 | 
			
		||||
from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides
 | 
			
		||||
from .. import util
 | 
			
		||||
from ..lang.en import English
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -10,8 +13,10 @@ from ..lang.en import English
 | 
			
		|||
@debug_cli.command("model")
 | 
			
		||||
def debug_model_cli(
 | 
			
		||||
    # fmt: off
 | 
			
		||||
    ctx: typer.Context,  # This is only used to read additional arguments
 | 
			
		||||
    config_path: Path = Arg(..., help="Path to config file", exists=True),
 | 
			
		||||
    layers: str = Opt("", "--layers", "-l", help="Comma-separated names of pipeline components to train"),
 | 
			
		||||
    section: str = Arg(..., help="Section that defines the model to be analysed"),
 | 
			
		||||
    layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
 | 
			
		||||
    dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
 | 
			
		||||
    parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
 | 
			
		||||
    gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
 | 
			
		||||
| 
						 | 
				
			
			@ -20,14 +25,18 @@ def debug_model_cli(
 | 
			
		|||
    P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
 | 
			
		||||
    P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
 | 
			
		||||
    P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"),
 | 
			
		||||
    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
 | 
			
		||||
    seed: int = Opt(None, "--seed", "-s", help="Use GPU"),
 | 
			
		||||
    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU")
 | 
			
		||||
    # fmt: on
 | 
			
		||||
):
 | 
			
		||||
    """
 | 
			
		||||
    Analyze a Thinc model implementation. Includes checks for internal structure
 | 
			
		||||
    and activations during training.
 | 
			
		||||
    """
 | 
			
		||||
    if use_gpu >= 0:
 | 
			
		||||
        msg.info("Using GPU")
 | 
			
		||||
        require_gpu(use_gpu)
 | 
			
		||||
    else:
 | 
			
		||||
        msg.info("Using CPU")
 | 
			
		||||
    print_settings = {
 | 
			
		||||
        "dimensions": dimensions,
 | 
			
		||||
        "parameters": parameters,
 | 
			
		||||
| 
						 | 
				
			
			@ -39,27 +48,47 @@ def debug_model_cli(
 | 
			
		|||
        "print_after_training": P2,
 | 
			
		||||
        "print_prediction": P3,
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    config_overrides = parse_config_overrides(ctx.args)
 | 
			
		||||
    cfg = Config().from_disk(config_path)
 | 
			
		||||
    with show_validation_error():
 | 
			
		||||
        try:
 | 
			
		||||
            _, config = util.load_model_from_config(cfg, overrides=config_overrides)
 | 
			
		||||
        except ValueError as e:
 | 
			
		||||
            msg.fail(str(e), exits=1)
 | 
			
		||||
    seed = config["pretraining"]["seed"]
 | 
			
		||||
    if seed is not None:
 | 
			
		||||
        msg.info(f"Fixing random seed: {seed}")
 | 
			
		||||
        fix_random_seed(seed)
 | 
			
		||||
    if use_gpu >= 0:
 | 
			
		||||
        msg.info(f"Using GPU: {use_gpu}")
 | 
			
		||||
        require_gpu(use_gpu)
 | 
			
		||||
 | 
			
		||||
    component = config
 | 
			
		||||
    parts = section.split(".")
 | 
			
		||||
    for item in parts:
 | 
			
		||||
        try:
 | 
			
		||||
            component = component[item]
 | 
			
		||||
        except KeyError:
 | 
			
		||||
            msg.fail(
 | 
			
		||||
                f"The section '{section}' is not a valid section in the provided config.",
 | 
			
		||||
                exits=1,
 | 
			
		||||
            )
 | 
			
		||||
    if hasattr(component, "model"):
 | 
			
		||||
        model = component.model
 | 
			
		||||
    else:
 | 
			
		||||
        msg.info(f"Using CPU")
 | 
			
		||||
 | 
			
		||||
    debug_model(
 | 
			
		||||
        config_path, print_settings=print_settings,
 | 
			
		||||
    )
 | 
			
		||||
        msg.fail(
 | 
			
		||||
            f"The section '{section}' does not specify an object that holds a Model.",
 | 
			
		||||
            exits=1,
 | 
			
		||||
        )
 | 
			
		||||
    debug_model(model, print_settings=print_settings)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def debug_model(config_path: Path, *, print_settings=None):
 | 
			
		||||
def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
 | 
			
		||||
    if not isinstance(model, Model):
 | 
			
		||||
        msg.fail(
 | 
			
		||||
            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
 | 
			
		||||
            exits=1,
 | 
			
		||||
        )
 | 
			
		||||
    if print_settings is None:
 | 
			
		||||
        print_settings = {}
 | 
			
		||||
 | 
			
		||||
    model = util.load_config(config_path, create_objects=True)["model"]
 | 
			
		||||
 | 
			
		||||
    # STEP 0: Printing before training
 | 
			
		||||
    msg.info(f"Analysing model with ID {model.id}")
 | 
			
		||||
    if print_settings.get("print_before_training"):
 | 
			
		||||
| 
						 | 
				
			
			@ -67,7 +96,9 @@ def debug_model(config_path: Path, *, print_settings=None):
 | 
			
		|||
        _print_model(model, print_settings)
 | 
			
		||||
 | 
			
		||||
    # STEP 1: Initializing the model and printing again
 | 
			
		||||
    model.initialize(X=_get_docs(), Y=_get_output(model.ops.xp))
 | 
			
		||||
    Y = _get_output(model.ops.xp)
 | 
			
		||||
    _set_output_dim(nO=Y.shape[-1], model=model)
 | 
			
		||||
    model.initialize(X=_get_docs(), Y=Y)
 | 
			
		||||
    if print_settings.get("print_after_init"):
 | 
			
		||||
        msg.info(f"After initialization:")
 | 
			
		||||
        _print_model(model, print_settings)
 | 
			
		||||
| 
						 | 
				
			
@@ -110,12 +141,16 @@ def _get_docs():
def _get_output(xp):
    return xp.asarray(
        [
            xp.asarray([i + 10, i + 20, i + 30], dtype="float32")
            for i, _ in enumerate(_get_docs())
        ]
    )
    return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")


def _set_output_dim(model, nO):
    # the dim inference doesn't always work 100%, we need this hack like we have it in pipe.pyx
    if model.has_dim("nO") is None:
        model.set_dim("nO", nO)
    if model.has_ref("output_layer"):
        if model.get_ref("output_layer").has_dim("nO") is None:
            model.get_ref("output_layer").set_dim("nO", nO)


def _print_model(model, print_settings):
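_set_output_dim works around cases where Thinc's shape inference leaves nO unset; Model.has_dim returns None for a dimension that is declared but has no value yet. A minimal sketch of the same check against a bare Thinc layer, using Linear as a stand-in for a component model:

from thinc.api import Linear

def set_output_dim(model, nO):
    # Mirror of the helper in the diff: only fill in nO when it is declared
    # but not yet inferred.
    if model.has_dim("nO") is None:
        model.set_dim("nO", nO)
    if model.has_ref("output_layer"):
        if model.get_ref("output_layer").has_dim("nO") is None:
            model.get_ref("output_layer").set_dim("nO", nO)

model = Linear()            # nO and nI are declared but unset
print(model.has_dim("nO"))  # None
set_output_dim(model, nO=3)
print(model.get_dim("nO"))  # 3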
@@ -105,9 +105,10 @@ def evaluate(
        print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat)

    if displacy_path:
        factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
        docs = [ex.predicted for ex in dev_dataset]
        render_deps = "parser" in nlp.meta.get("pipeline", [])
        render_ents = "ner" in nlp.meta.get("pipeline", [])
        render_deps = "parser" in factory_names
        render_ents = "ner" in factory_names
        render_parses(
            docs,
            displacy_path,
@@ -84,7 +84,6 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
    if model_path.resolve() != model_path:
        meta["link"] = str(model_path)
        meta["source"] = str(model_path.resolve())
    else:
        meta["source"] = str(model_path)
@@ -125,7 +125,6 @@ def get_meta(
    meta.update(existing_meta)
    nlp = util.load_model_from_path(Path(model_path))
    meta["spacy_version"] = util.get_model_version_range(about.__version__)
    meta["pipeline"] = nlp.pipe_names
    meta["vectors"] = {
        "width": nlp.vocab.vectors_length,
        "vectors": len(nlp.vocab.vectors),
			@ -5,7 +5,7 @@ import time
 | 
			
		|||
import re
 | 
			
		||||
from collections import Counter
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
 | 
			
		||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu, Config
 | 
			
		||||
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
 | 
			
		||||
from thinc.api import CosineDistance, L2Distance
 | 
			
		||||
from wasabi import msg
 | 
			
		||||
| 
						 | 
				
			
			@ -15,7 +15,6 @@ import typer
 | 
			
		|||
 | 
			
		||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 | 
			
		||||
from ._util import import_code
 | 
			
		||||
from ..schemas import ConfigSchema
 | 
			
		||||
from ..errors import Errors
 | 
			
		||||
from ..ml.models.multi_task import build_cloze_multi_task_model
 | 
			
		||||
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 | 
			
		||||
| 
						 | 
				
			
			@ -37,6 +36,7 @@ def pretrain_cli(
 | 
			
		|||
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 | 
			
		||||
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
 | 
			
		||||
    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
 | 
			
		||||
    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU"),
 | 
			
		||||
    # fmt: on
 | 
			
		||||
):
 | 
			
		||||
    """
 | 
			
		||||
| 
						 | 
				
			
			@ -67,6 +67,7 @@ def pretrain_cli(
 | 
			
		|||
        config_overrides=overrides,
 | 
			
		||||
        resume_path=resume_path,
 | 
			
		||||
        epoch_resume=epoch_resume,
 | 
			
		||||
        use_gpu=use_gpu,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -77,40 +78,29 @@ def pretrain(
 | 
			
		|||
    config_overrides: Dict[str, Any] = {},
 | 
			
		||||
    resume_path: Optional[Path] = None,
 | 
			
		||||
    epoch_resume: Optional[int] = None,
 | 
			
		||||
    use_gpu: int = -1,
 | 
			
		||||
):
 | 
			
		||||
    verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
 | 
			
		||||
    msg.info(f"Loading config from: {config_path}")
 | 
			
		||||
    with show_validation_error():
 | 
			
		||||
        config = util.load_config(
 | 
			
		||||
            config_path,
 | 
			
		||||
            create_objects=False,
 | 
			
		||||
            validate=True,
 | 
			
		||||
            schema=ConfigSchema,
 | 
			
		||||
            overrides=config_overrides,
 | 
			
		||||
        )
 | 
			
		||||
    if not output_dir.exists():
 | 
			
		||||
        output_dir.mkdir()
 | 
			
		||||
        msg.good(f"Created output directory: {output_dir}")
 | 
			
		||||
 | 
			
		||||
    use_gpu = config["training"]["use_gpu"]
 | 
			
		||||
    if use_gpu >= 0:
 | 
			
		||||
        msg.info("Using GPU")
 | 
			
		||||
        require_gpu(use_gpu)
 | 
			
		||||
    else:
 | 
			
		||||
        msg.info("Using CPU")
 | 
			
		||||
 | 
			
		||||
    msg.info(f"Loading config from: {config_path}")
 | 
			
		||||
    config = Config().from_disk(config_path)
 | 
			
		||||
    with show_validation_error():
 | 
			
		||||
        nlp, config = util.load_model_from_config(config, overrides=config_overrides)
 | 
			
		||||
    # TODO: validate that [pretraining] block exists
 | 
			
		||||
    if not output_dir.exists():
 | 
			
		||||
        output_dir.mkdir()
 | 
			
		||||
        msg.good(f"Created output directory: {output_dir}")
 | 
			
		||||
    seed = config["pretraining"]["seed"]
 | 
			
		||||
    if seed is not None:
 | 
			
		||||
        fix_random_seed(seed)
 | 
			
		||||
    if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]:
 | 
			
		||||
        use_pytorch_for_gpu_memory()
 | 
			
		||||
 | 
			
		||||
    nlp_config = config["nlp"]
 | 
			
		||||
    srsly.write_json(output_dir / "config.json", config)
 | 
			
		||||
    config.to_disk(output_dir / "config.cfg")
 | 
			
		||||
    msg.good("Saved config file in the output directory")
 | 
			
		||||
 | 
			
		||||
    config = util.load_config(config_path, create_objects=True)
 | 
			
		||||
    nlp = util.load_model_from_config(nlp_config)
 | 
			
		||||
    pretrain_config = config["pretraining"]
 | 
			
		||||
 | 
			
		||||
    if texts_loc != "-":  # reading from a file
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -25,7 +25,7 @@ def profile_cli(
 | 
			
		|||
    # fmt: on
 | 
			
		||||
):
 | 
			
		||||
    """
 | 
			
		||||
    Profile a spaCy pipeline, to find out which functions take the most time.
 | 
			
		||||
    Profile which functions take the most time in a spaCy pipeline.
 | 
			
		||||
    Input should be formatted as one JSON object per line with a key "text".
 | 
			
		||||
    It can either be provided as a JSONL file, or be read from sys.sytdin.
 | 
			
		||||
    If no input file is specified, the IMDB dataset is loaded via Thinc.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,4 +1,4 @@
 | 
			
		|||
from typing import Optional, Dict, Any
 | 
			
		||||
from typing import Optional, Dict, Any, Tuple, Union, Callable, List
 | 
			
		||||
from timeit import default_timer as timer
 | 
			
		||||
import srsly
 | 
			
		||||
import tqdm
 | 
			
		||||
| 
						 | 
				
			
			@ -7,6 +7,7 @@ from wasabi import msg
 | 
			
		|||
import thinc
 | 
			
		||||
import thinc.schedules
 | 
			
		||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
 | 
			
		||||
from thinc.api import Config, Optimizer
 | 
			
		||||
import random
 | 
			
		||||
import typer
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -14,18 +15,15 @@ from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 | 
			
		|||
from ._util import import_code
 | 
			
		||||
from ..gold import Corpus, Example
 | 
			
		||||
from ..lookups import Lookups
 | 
			
		||||
from ..language import Language
 | 
			
		||||
from .. import util
 | 
			
		||||
from ..errors import Errors
 | 
			
		||||
from ..schemas import ConfigSchema
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Don't remove - required to load the built-in architectures
 | 
			
		||||
from ..ml import models  # noqa: F401
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
registry = util.registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@app.command(
 | 
			
		||||
    "train", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
 | 
			
		||||
)
 | 
			
		||||
| 
						 | 
				
			
			@ -38,6 +36,8 @@ def train_cli(
 | 
			
		|||
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
 | 
			
		||||
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 | 
			
		||||
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
 | 
			
		||||
    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU"),
 | 
			
		||||
    resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
 | 
			
		||||
    # fmt: on
 | 
			
		||||
):
 | 
			
		||||
    """
 | 
			
		||||
| 
						 | 
				
			
			@ -53,9 +53,7 @@ def train_cli(
 | 
			
		|||
    referenced in the config.
 | 
			
		||||
    """
 | 
			
		||||
    util.set_env_log(verbose)
 | 
			
		||||
    verify_cli_args(
 | 
			
		||||
        train_path=train_path, dev_path=dev_path, config_path=config_path,
 | 
			
		||||
    )
 | 
			
		||||
    verify_cli_args(train_path, dev_path, config_path)
 | 
			
		||||
    overrides = parse_config_overrides(ctx.args)
 | 
			
		||||
    import_code(code_path)
 | 
			
		||||
    train(
 | 
			
		||||
| 
						 | 
				
			
			@ -63,6 +61,8 @@ def train_cli(
 | 
			
		|||
        {"train": train_path, "dev": dev_path},
 | 
			
		||||
        output_path=output_path,
 | 
			
		||||
        config_overrides=overrides,
 | 
			
		||||
        use_gpu=use_gpu,
 | 
			
		||||
        resume_training=resume,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -72,63 +72,53 @@ def train(
 | 
			
		|||
    raw_text: Optional[Path] = None,
 | 
			
		||||
    output_path: Optional[Path] = None,
 | 
			
		||||
    config_overrides: Dict[str, Any] = {},
 | 
			
		||||
    use_gpu: int = -1,
 | 
			
		||||
    resume_training: bool = False,
 | 
			
		||||
) -> None:
 | 
			
		||||
    msg.info(f"Loading config from: {config_path}")
 | 
			
		||||
    # Read the config first without creating objects, to get to the original nlp_config
 | 
			
		||||
    with show_validation_error():
 | 
			
		||||
        config = util.load_config(
 | 
			
		||||
            config_path,
 | 
			
		||||
            create_objects=False,
 | 
			
		||||
            schema=ConfigSchema,
 | 
			
		||||
            overrides=config_overrides,
 | 
			
		||||
        )
 | 
			
		||||
    use_gpu = config["training"]["use_gpu"]
 | 
			
		||||
    if use_gpu >= 0:
 | 
			
		||||
        msg.info(f"Using GPU: {use_gpu}")
 | 
			
		||||
        require_gpu(use_gpu)
 | 
			
		||||
    else:
 | 
			
		||||
        msg.info("Using CPU")
 | 
			
		||||
    msg.info(f"Loading config and nlp from: {config_path}")
 | 
			
		||||
    config = Config().from_disk(config_path)
 | 
			
		||||
    with show_validation_error():
 | 
			
		||||
        nlp, config = util.load_model_from_config(config, overrides=config_overrides)
 | 
			
		||||
    if config["training"]["base_model"]:
 | 
			
		||||
        base_nlp = util.load_model(config["training"]["base_model"])
 | 
			
		||||
        # TODO: do something to check base_nlp against regular nlp described in config?
 | 
			
		||||
        nlp = base_nlp
 | 
			
		||||
    verify_config(nlp)
 | 
			
		||||
    raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
 | 
			
		||||
    if config["training"]["seed"] is not None:
 | 
			
		||||
        fix_random_seed(config["training"]["seed"])
 | 
			
		||||
    if config["training"].get("use_pytorch_for_gpu_memory"):
 | 
			
		||||
    if config["training"]["use_pytorch_for_gpu_memory"]:
 | 
			
		||||
        # It feels kind of weird to not have a default for this.
 | 
			
		||||
        use_pytorch_for_gpu_memory()
 | 
			
		||||
    nlp_config = config["nlp"]
 | 
			
		||||
    config = util.load_config(
 | 
			
		||||
        config_path,
 | 
			
		||||
        create_objects=True,
 | 
			
		||||
        schema=ConfigSchema,
 | 
			
		||||
        overrides=config_overrides,
 | 
			
		||||
    )
 | 
			
		||||
    training = config["training"]
 | 
			
		||||
    msg.info("Creating nlp from config")
 | 
			
		||||
    nlp = util.load_model_from_config(nlp_config)
 | 
			
		||||
    optimizer = training["optimizer"]
 | 
			
		||||
    limit = training["limit"]
 | 
			
		||||
    corpus = Corpus(data_paths["train"], data_paths["dev"], limit=limit)
 | 
			
		||||
    if "textcat" in nlp_config["pipeline"]:
 | 
			
		||||
        verify_textcat_config(nlp, nlp_config)
 | 
			
		||||
    if training.get("resume", False):
 | 
			
		||||
    if resume_training:
 | 
			
		||||
        msg.info("Resuming training")
 | 
			
		||||
        nlp.resume_training()
 | 
			
		||||
    else:
 | 
			
		||||
        msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
 | 
			
		||||
        train_examples = list(
 | 
			
		||||
            corpus.train_dataset(
 | 
			
		||||
                nlp,
 | 
			
		||||
                shuffle=False,
 | 
			
		||||
                gold_preproc=training["gold_preproc"],
 | 
			
		||||
                max_length=training["max_length"],
 | 
			
		||||
            )
 | 
			
		||||
        train_examples = corpus.train_dataset(
 | 
			
		||||
            nlp,
 | 
			
		||||
            shuffle=False,
 | 
			
		||||
            gold_preproc=training["gold_preproc"],
 | 
			
		||||
            max_length=training["max_length"],
 | 
			
		||||
        )
 | 
			
		||||
        train_examples = list(train_examples)
 | 
			
		||||
        nlp.begin_training(lambda: train_examples)
 | 
			
		||||
 | 
			
		||||
    # Replace tag map with provided mapping
 | 
			
		||||
    nlp.vocab.morphology.load_tag_map(tag_map)
 | 
			
		||||
 | 
			
		||||
    # Load morph rules
 | 
			
		||||
    nlp.vocab.morphology.load_morph_exceptions(morph_rules)
 | 
			
		||||
    if tag_map:
 | 
			
		||||
        # Replace tag map with provided mapping
 | 
			
		||||
        nlp.vocab.morphology.load_tag_map(tag_map)
 | 
			
		||||
    if morph_rules:
 | 
			
		||||
        # Load morph rules
 | 
			
		||||
        nlp.vocab.morphology.load_morph_exceptions(morph_rules)
 | 
			
		||||
 | 
			
		||||
    # Create empty extra lexeme tables so the data from spacy-lookups-data
 | 
			
		||||
    # isn't loaded if these features are accessed
 | 
			
		||||
| 
						 | 
				
			
			@ -151,9 +141,8 @@ def train(
 | 
			
		|||
        for subpath in tok2vec_path.split("."):
 | 
			
		||||
            tok2vec = tok2vec.get(subpath)
 | 
			
		||||
        if not tok2vec:
 | 
			
		||||
            msg.fail(
 | 
			
		||||
                f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1,
 | 
			
		||||
            )
 | 
			
		||||
            err = f"Could not locate the tok2vec model at {tok2vec_path}"
 | 
			
		||||
            msg.fail(err, exits=1)
 | 
			
		||||
        tok2vec.from_bytes(weights_data)
 | 
			
		||||
 | 
			
		||||
    msg.info("Loading training corpus")
 | 
			
		||||
| 
						 | 
				
			
			@ -169,12 +158,11 @@ def train(
 | 
			
		|||
        evaluate,
 | 
			
		||||
        dropout=training["dropout"],
 | 
			
		||||
        accumulate_gradient=training["accumulate_gradient"],
 | 
			
		||||
        patience=training.get("patience", 0),
 | 
			
		||||
        max_steps=training.get("max_steps", 0),
 | 
			
		||||
        patience=training["patience"],
 | 
			
		||||
        max_steps=training["max_steps"],
 | 
			
		||||
        eval_frequency=training["eval_frequency"],
 | 
			
		||||
        raw_text=raw_text,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
 | 
			
		||||
    print_row = setup_printer(training, nlp)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -209,8 +197,10 @@ def train(
 | 
			
		|||
            msg.good(f"Saved model to output directory {final_model_path}")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def create_train_batches(nlp, corpus, cfg):
 | 
			
		||||
    max_epochs = cfg.get("max_epochs", 0)
 | 
			
		||||
def create_train_batches(
 | 
			
		||||
    nlp: Language, corpus: Corpus, cfg: Union[Config, Dict[str, Any]]
 | 
			
		||||
):
 | 
			
		||||
    max_epochs = cfg["max_epochs"]
 | 
			
		||||
    train_examples = list(
 | 
			
		||||
        corpus.train_dataset(
 | 
			
		||||
            nlp,
 | 
			
		||||
| 
						 | 
				
			
			@ -219,9 +209,8 @@ def create_train_batches(nlp, corpus, cfg):
 | 
			
		|||
            max_length=cfg["max_length"],
 | 
			
		||||
        )
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    epoch = 0
 | 
			
		||||
    batch_strategy = cfg.get("batch_by", "sequences")
 | 
			
		||||
    batch_strategy = cfg["batch_by"]
 | 
			
		||||
    while True:
 | 
			
		||||
        if len(train_examples) == 0:
 | 
			
		||||
            raise ValueError(Errors.E988)
 | 
			
		||||
| 
						 | 
				
			
			@ -241,7 +230,6 @@ def create_train_batches(nlp, corpus, cfg):
 | 
			
		|||
            )
 | 
			
		||||
        else:
 | 
			
		||||
            batches = util.minibatch(train_examples, size=cfg["batch_size"])
 | 
			
		||||
 | 
			
		||||
        # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
 | 
			
		||||
        try:
 | 
			
		||||
            first = next(batches)
 | 
			
		||||
| 
						 | 
				
			
			@ -255,18 +243,20 @@ def create_train_batches(nlp, corpus, cfg):
 | 
			
		|||
        random.shuffle(train_examples)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def create_evaluation_callback(nlp, optimizer, corpus, cfg):
 | 
			
		||||
    def evaluate():
 | 
			
		||||
        dev_examples = list(
 | 
			
		||||
            corpus.dev_dataset(
 | 
			
		||||
                nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
 | 
			
		||||
            )
 | 
			
		||||
def create_evaluation_callback(
 | 
			
		||||
    nlp: Language,
 | 
			
		||||
    optimizer: Optimizer,
 | 
			
		||||
    corpus: Corpus,
 | 
			
		||||
    cfg: Union[Config, Dict[str, Any]],
 | 
			
		||||
) -> Callable[[], Tuple[float, Dict[str, float]]]:
 | 
			
		||||
    def evaluate() -> Tuple[float, Dict[str, float]]:
 | 
			
		||||
        dev_examples = corpus.dev_dataset(
 | 
			
		||||
            nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        dev_examples = list(dev_examples)
 | 
			
		||||
        n_words = sum(len(ex.predicted) for ex in dev_examples)
 | 
			
		||||
        batch_size = cfg.get("evaluation_batch_size", 128)
 | 
			
		||||
        batch_size = cfg["eval_batch_size"]
 | 
			
		||||
        start_time = timer()
 | 
			
		||||
 | 
			
		||||
        if optimizer.averages:
 | 
			
		||||
            with nlp.use_params(optimizer.averages):
 | 
			
		||||
                scorer = nlp.evaluate(dev_examples, batch_size=batch_size)
 | 
			
		||||
| 
						 | 
				
			
@@ -280,12 +270,9 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
        try:
            weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
        except KeyError as e:
            raise KeyError(
                Errors.E983.format(
                    dict="score_weights", key=str(e), keys=list(scores.keys())
                )
            )

            keys = list(scores.keys())
            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
            raise KeyError(err)
        scores["speed"] = wps
        return weighted_score, scores
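The checkpoint score computed above is a weighted sum of per-component scores, using the score_weights mapping from the [training] block (scores missing from the mapping contribute 0.0). A quick worked example with the default weights:

# score_weights from the default config; scores is a hypothetical evaluation result.
weights = {"tags_acc": 0.2, "las": 0.4, "ents_f": 0.4}
scores = {"tags_acc": 0.95, "las": 0.88, "ents_f": 0.80, "speed": 12000}

weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
print(round(weighted_score, 3))  # 0.19 + 0.352 + 0.32 = 0.862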
			@ -293,17 +280,17 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
def train_while_improving(
 | 
			
		||||
    nlp,
 | 
			
		||||
    optimizer,
 | 
			
		||||
    nlp: Language,
 | 
			
		||||
    optimizer: Optimizer,
 | 
			
		||||
    train_data,
 | 
			
		||||
    evaluate,
 | 
			
		||||
    *,
 | 
			
		||||
    dropout,
 | 
			
		||||
    eval_frequency,
 | 
			
		||||
    accumulate_gradient=1,
 | 
			
		||||
    patience=0,
 | 
			
		||||
    max_steps=0,
 | 
			
		||||
    raw_text=None,
 | 
			
		||||
    dropout: float,
 | 
			
		||||
    eval_frequency: int,
 | 
			
		||||
    accumulate_gradient: int,
 | 
			
		||||
    patience: int,
 | 
			
		||||
    max_steps: int,
 | 
			
		||||
    raw_text: List[Dict[str, str]],
 | 
			
		||||
):
 | 
			
		||||
    """Train until an evaluation stops improving. Works as a generator,
 | 
			
		||||
    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
 | 
			
		||||
| 
						 | 
				
			
			@ -414,7 +401,9 @@ def subdivide_batch(batch, accumulate_gradient):
 | 
			
		|||
        yield subbatch
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def setup_printer(training, nlp):
 | 
			
		||||
def setup_printer(
 | 
			
		||||
    training: Union[Dict[str, Any], Config], nlp: Language
 | 
			
		||||
) -> Callable[[Dict[str, Any]], None]:
 | 
			
		||||
    score_cols = training["scores"]
 | 
			
		||||
    score_widths = [max(len(col), 6) for col in score_cols]
 | 
			
		||||
    loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
 | 
			
		||||
| 
						 | 
				
			
			@ -423,11 +412,10 @@ def setup_printer(training, nlp):
 | 
			
		|||
    table_header = [col.upper() for col in table_header]
 | 
			
		||||
    table_widths = [3, 6] + loss_widths + score_widths + [6]
 | 
			
		||||
    table_aligns = ["r" for _ in table_widths]
 | 
			
		||||
 | 
			
		||||
    msg.row(table_header, widths=table_widths)
 | 
			
		||||
    msg.row(["-" * width for width in table_widths])
 | 
			
		||||
 | 
			
		||||
    def print_row(info):
 | 
			
		||||
    def print_row(info: Dict[str, Any]) -> None:
 | 
			
		||||
        try:
 | 
			
		||||
            losses = [
 | 
			
		||||
                "{0:.2f}".format(float(info["losses"][pipe_name]))
 | 
			
		||||
| 
						 | 
				
			
			@ -463,7 +451,9 @@ def setup_printer(training, nlp):
 | 
			
		|||
    return print_row
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def update_meta(training, nlp, info):
 | 
			
		||||
def update_meta(
 | 
			
		||||
    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
 | 
			
		||||
) -> None:
 | 
			
		||||
    score_cols = training["scores"]
 | 
			
		||||
    nlp.meta["performance"] = {}
 | 
			
		||||
    for metric in score_cols:
 | 
			
		||||
| 
						 | 
				
			
			@ -472,7 +462,9 @@ def update_meta(training, nlp, info):
 | 
			
		|||
        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def load_from_paths(config):
 | 
			
		||||
def load_from_paths(
 | 
			
		||||
    config: Config,
 | 
			
		||||
) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
 | 
			
		||||
    # TODO: separate checks from loading
 | 
			
		||||
    raw_text = util.ensure_path(config["training"]["raw_text"])
 | 
			
		||||
    if raw_text is not None:
 | 
			
		||||
| 
						 | 
				
			
			@ -506,7 +498,7 @@ def verify_cli_args(
 | 
			
		|||
    dev_path: Path,
 | 
			
		||||
    config_path: Path,
 | 
			
		||||
    output_path: Optional[Path] = None,
 | 
			
		||||
):
 | 
			
		||||
) -> None:
 | 
			
		||||
    # Make sure all files and paths exists if they are needed
 | 
			
		||||
    if not config_path or not config_path.exists():
 | 
			
		||||
        msg.fail("Config file not found", config_path, exits=1)
 | 
			
		||||
| 
						 | 
				
			
			@ -528,12 +520,23 @@ def verify_cli_args(
 | 
			
		|||
            )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def verify_textcat_config(nlp, nlp_config):
 | 
			
		||||
def verify_config(nlp: Language) -> None:
 | 
			
		||||
    """Perform additional checks based on the config and loaded nlp object."""
 | 
			
		||||
    # TODO: maybe we should validate based on the actual components, the list
 | 
			
		||||
    # in config["nlp"]["pipeline"] instead?
 | 
			
		||||
    for pipe_config in nlp.config["components"].values():
 | 
			
		||||
        # We can't assume that the component name == the factory
 | 
			
		||||
        factory = pipe_config["@factories"]
 | 
			
		||||
        if factory == "textcat":
 | 
			
		||||
            verify_textcat_config(nlp, pipe_config)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
 | 
			
		||||
    # if 'positive_label' is provided: double check whether it's in the data and
 | 
			
		||||
    # the task is binary
 | 
			
		||||
    if nlp_config["pipeline"]["textcat"].get("positive_label", None):
 | 
			
		||||
    if pipe_config.get("positive_label"):
 | 
			
		||||
        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
 | 
			
		||||
        pos_label = nlp_config["pipeline"]["textcat"]["positive_label"]
 | 
			
		||||
        pos_label = pipe_config.get("positive_label")
 | 
			
		||||
        if pos_label not in textcat_labels:
 | 
			
		||||
            msg.fail(
 | 
			
		||||
                f"The textcat's 'positive_label' config setting '{pos_label}' "
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
spacy/default_config.cfg (new file, 102 lines)
			@ -0,0 +1,102 @@
 | 
			
		|||
[nlp]
 | 
			
		||||
lang = null
 | 
			
		||||
stop_words = []
 | 
			
		||||
lex_attr_getters = {}
 | 
			
		||||
pipeline = []
 | 
			
		||||
 | 
			
		||||
[nlp.tokenizer]
 | 
			
		||||
@tokenizers = "spacy.Tokenizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer]
 | 
			
		||||
@lemmatizers = "spacy.Lemmatizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.writing_system]
 | 
			
		||||
direction = "ltr"
 | 
			
		||||
has_case = true
 | 
			
		||||
has_letters = true
 | 
			
		||||
 | 
			
		||||
[components]
 | 
			
		||||
 | 
			
		||||
# Training hyper-parameters and additional features.
 | 
			
		||||
[training]
 | 
			
		||||
# Whether to train on sequences with 'gold standard' sentence boundaries
 | 
			
		||||
# and tokens. If you set this to true, take care to ensure your run-time
 | 
			
		||||
# data is passed in sentence-by-sentence via some prior preprocessing.
 | 
			
		||||
gold_preproc = false
 | 
			
		||||
# Limitations on training document length or number of examples.
 | 
			
		||||
max_length = 5000
 | 
			
		||||
limit = 0
 | 
			
		||||
# Data augmentation
 | 
			
		||||
orth_variant_level = 0.0
 | 
			
		||||
dropout = 0.1
 | 
			
		||||
# Controls early-stopping. 0 or -1 mean unlimited.
 | 
			
		||||
patience = 1600
 | 
			
		||||
max_epochs = 0
 | 
			
		||||
max_steps = 20000
 | 
			
		||||
eval_frequency = 200
 | 
			
		||||
eval_batch_size = 128
 | 
			
		||||
# Other settings
 | 
			
		||||
seed = 0
 | 
			
		||||
accumulate_gradient = 1
 | 
			
		||||
use_pytorch_for_gpu_memory = false
 | 
			
		||||
# Control how scores are printed and checkpoints are evaluated.
 | 
			
		||||
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
 | 
			
		||||
score_weights = {"tags_acc": 0.2, "las": 0.4, "ents_f": 0.4}
 | 
			
		||||
# These settings are invalid for the transformer models.
 | 
			
		||||
init_tok2vec = null
 | 
			
		||||
discard_oversize = false
 | 
			
		||||
omit_extra_lookups = false
 | 
			
		||||
batch_by = "sequences"
 | 
			
		||||
raw_text = null
 | 
			
		||||
tag_map = null
 | 
			
		||||
morph_rules = null
 | 
			
		||||
base_model = null
 | 
			
		||||
vectors = null
 | 
			
		||||
 | 
			
		||||
[training.batch_size]
 | 
			
		||||
@schedules = "compounding.v1"
 | 
			
		||||
start = 1000
 | 
			
		||||
stop = 1000
 | 
			
		||||
compound = 1.001
 | 
			
		||||
 | 
			
		||||
[training.optimizer]
 | 
			
		||||
@optimizers = "Adam.v1"
 | 
			
		||||
beta1 = 0.9
 | 
			
		||||
beta2 = 0.999
 | 
			
		||||
L2_is_weight_decay = true
 | 
			
		||||
L2 = 0.01
 | 
			
		||||
grad_clip = 1.0
 | 
			
		||||
use_averages = false
 | 
			
		||||
eps = 1e-8
 | 
			
		||||
 | 
			
		||||
[training.optimizer.learn_rate]
 | 
			
		||||
@schedules = "warmup_linear.v1"
 | 
			
		||||
warmup_steps = 250
 | 
			
		||||
total_steps = 20000
 | 
			
		||||
initial_rate = 0.001
 | 
			
		||||
 | 
			
		||||
[pretraining]
 | 
			
		||||
max_epochs = 1000
 | 
			
		||||
min_length = 5
 | 
			
		||||
max_length = 500
 | 
			
		||||
dropout = 0.2
 | 
			
		||||
n_save_every = null
 | 
			
		||||
batch_size = 3000
 | 
			
		||||
seed = ${training:seed}
 | 
			
		||||
use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
 | 
			
		||||
tok2vec_model = "components.tok2vec.model"
 | 
			
		||||
 | 
			
		||||
[pretraining.objective]
 | 
			
		||||
type = "characters"
 | 
			
		||||
n_characters = 4
 | 
			
		||||
 | 
			
		||||
[pretraining.optimizer]
 | 
			
		||||
@optimizers = "Adam.v1"
 | 
			
		||||
beta1 = 0.9
 | 
			
		||||
beta2 = 0.999
 | 
			
		||||
L2_is_weight_decay = true
 | 
			
		||||
L2 = 0.01
 | 
			
		||||
grad_clip = 1.0
 | 
			
		||||
use_averages = true
 | 
			
		||||
eps = 1e-8
 | 
			
		||||
learn_rate = 0.001
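The block above is the complete new spacy/default_config.cfg. As a minimal illustration (not part of this diff), the file can be loaded and tweaked with Thinc's `Config` class, the same class the language modules below use via `Config().from_str(...)`; the local file path in the sketch is an assumption.

```python
# Minimal sketch: load the default config shown above and adjust a training
# setting. The path is assumed for the example; inside spaCy the file ships
# as spacy/default_config.cfg.
from thinc.api import Config

config = Config().from_disk("default_config.cfg")
config["training"]["dropout"] = 0.2       # Config behaves like a nested dict
config["training"]["max_epochs"] = 10
print(config["nlp"]["lang"])              # None here: the default leaves lang = null
print(config.to_str())                    # serialize back to .cfg text
```

Component blocks under [components] are resolved separately through the registered factories, which is what keeps the config itself JSON-serializable.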
spacy/errors.py (108 lines changed)
@@ -124,20 +124,24 @@ class Warnings:
@add_codes
 | 
			
		||||
class Errors:
 | 
			
		||||
    E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
 | 
			
		||||
    E002 = ("Can't find factory for '{name}'. This usually happens when spaCy "
 | 
			
		||||
            "calls `nlp.create_pipe` with a component name that's not built "
 | 
			
		||||
            "in - for example, when constructing the pipeline from a model's "
 | 
			
		||||
            "meta.json. If you're using a custom component, you can write to "
 | 
			
		||||
            "`Language.factories['{name}']` or remove it from the model meta "
 | 
			
		||||
            "and add it via `nlp.add_pipe` instead.")
 | 
			
		||||
    E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
 | 
			
		||||
            "This usually happens when spaCy calls nlp.{method} with custom "
 | 
			
		||||
            "component name that's not registered on the current language class. "
 | 
			
		||||
            "If you're using a custom component, make sure you've added the "
 | 
			
		||||
            "decorator @Language.component (for function components) or "
 | 
			
		||||
            "@Language.factory (for class components).\n\nAvailable "
 | 
			
		||||
            "factories: {opts}")
 | 
			
		||||
    E003 = ("Not a valid pipeline component. Expected callable, but "
 | 
			
		||||
            "got {component} (name: '{name}').")
 | 
			
		||||
    E004 = ("If you meant to add a built-in component, use `create_pipe`: "
 | 
			
		||||
            "`nlp.add_pipe(nlp.create_pipe('{component}'))`")
 | 
			
		||||
            "got {component} (name: '{name}'). If you're using a custom "
 | 
			
		||||
            "component factory, double-check that it correctly returns your "
 | 
			
		||||
            "initialized component.")
 | 
			
		||||
    E004 = ("Can't set up pipeline component: a factory for '{name}' already exists.")
 | 
			
		||||
    E005 = ("Pipeline component '{name}' returned None. If you're using a "
 | 
			
		||||
            "custom component, maybe you forgot to return the processed Doc?")
 | 
			
		||||
    E006 = ("Invalid constraints. You can only set one of the following: "
 | 
			
		||||
            "before, after, first, last.")
 | 
			
		||||
    E006 = ("Invalid constraints for adding pipeline component. You can only "
 | 
			
		||||
            "set one of the following: before (component name or index), "
 | 
			
		||||
            "after (component name or index), first (True) or last (True). "
 | 
			
		||||
            "Invalid configuration: {args}. Existing components: {opts}")
 | 
			
		||||
    E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
 | 
			
		||||
    E008 = ("Some current components would be lost when restoring previous "
 | 
			
		||||
            "pipeline state. If you added components after calling "
 | 
			
		||||
@@ -184,7 +188,7 @@ class Errors:
            "the documentation:\nhttps://spacy.io/usage/models")
 | 
			
		||||
    E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
 | 
			
		||||
            "component to the pipeline with: "
 | 
			
		||||
            "nlp.add_pipe(nlp.create_pipe('sentencizer')). "
 | 
			
		||||
            "nlp.add_pipe('sentencizer'). "
 | 
			
		||||
            "Alternatively, add the dependency parser, or set sentence "
 | 
			
		||||
            "boundaries by setting doc[i].is_sent_start.")
 | 
			
		||||
    E031 = ("Invalid token: empty string ('') at position {i}.")
 | 
			
		||||
@@ -365,8 +369,6 @@ class Errors:
    E133 = ("The sum of prior probabilities for alias '{alias}' should not "
 | 
			
		||||
            "exceed 1, but found {sum}.")
 | 
			
		||||
    E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
 | 
			
		||||
    E135 = ("If you meant to replace a built-in component, use `create_pipe`: "
 | 
			
		||||
            "`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
 | 
			
		||||
    E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
 | 
			
		||||
            "to provide a valid JSON object as input with either the `text` "
 | 
			
		||||
            "or `tokens` key. For more info, see the docs:\n"
 | 
			
		||||
@@ -484,6 +486,62 @@ class Errors:
    E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
 | 
			
		||||
 | 
			
		||||
    # TODO: fix numbering after merging develop into master
 | 
			
		||||
    E956 = ("Can't find component '{name}' in [components] block in the config. "
 | 
			
		||||
            "Available components: {opts}")
 | 
			
		||||
    E957 = ("Writing directly to Language.factories isn't needed anymore in "
 | 
			
		||||
            "spaCy v3. Instead, you can use the @Language.factory decorator "
 | 
			
		||||
            "to register your custom component factory or @Language.component "
 | 
			
		||||
            "to register a simple stateless function component that just takes "
 | 
			
		||||
            "a Doc and returns it.")
 | 
			
		||||
    E958 = ("Language code defined in config ({bad_lang_code}) does not match "
 | 
			
		||||
            "language code of current Language subclass {lang} ({lang_code})")
 | 
			
		||||
    E959 = ("Can't insert component {dir} index {idx}. Existing components: {opts}")
 | 
			
		||||
    E960 = ("No config data found for component '{name}'. This is likely a bug "
 | 
			
		||||
            "in spaCy.")
 | 
			
		||||
    E961 = ("Found non-serializable Python object in config. Configs should "
 | 
			
		||||
            "only include values that can be serialized to JSON. If you need "
 | 
			
		||||
            "to pass models or other objects to your component, use a reference "
 | 
			
		||||
            "to a registered function or initialize the object in your "
 | 
			
		||||
            "component.\n\n{config}")
 | 
			
		||||
    E962 = ("Received incorrect {style} for pipe '{name}'. Expected dict, "
 | 
			
		||||
            "got: {cfg_type}.")
 | 
			
		||||
    E963 = ("Can't read component info from @Language.{decorator} decorator. "
 | 
			
		||||
            "Maybe you forgot to call it? Make sure you're using "
 | 
			
		||||
            "@Language.{decorator}() instead of @Language.{decorator}.")
 | 
			
		||||
    E964 = ("The pipeline component factory for '{name}' needs to have the "
 | 
			
		||||
            "following named arguments, which are passed in by spaCy:\n- nlp: "
 | 
			
		||||
            "receives the current nlp object and lets you access the vocab\n- "
 | 
			
		||||
            "name: the name of the component instance, can be used to identify "
 | 
			
		||||
            "the component, output losses etc.")
 | 
			
		||||
    E965 = ("It looks like you're using the @Language.component decorator to "
 | 
			
		||||
            "register '{name}' on a class instead of a function component. If "
 | 
			
		||||
            "you need to register a class or function that *returns* a component "
 | 
			
		||||
            "function, use the @Language.factory decorator instead.")
 | 
			
		||||
    E966 = ("nlp.add_pipe now takes the string name of the registered component "
 | 
			
		||||
            "factory, not a callable component. Expected string, but got "
 | 
			
		||||
            "{component} (name: '{name}').\n\n- If you created your component "
 | 
			
		||||
            "with nlp.create_pipe('name'): remove nlp.create_pipe and call "
 | 
			
		||||
            "nlp.add_pipe('name') instead.\n\n- If you passed in a component "
 | 
			
		||||
            "like TextCategorizer(): call nlp.add_pipe with the string name "
 | 
			
		||||
            "instead, e.g. nlp.add_pipe('textcat').\n\n- If you're using a custom "
 | 
			
		||||
            "component: Add the decorator @Language.component (for function "
 | 
			
		||||
            "components) or @Language.factory (for class components / factories) "
 | 
			
		||||
            "to your custom component and assign it a name, e.g. "
 | 
			
		||||
            "@Language.component('your_name'). You can then run "
 | 
			
		||||
            "nlp.add_pipe('your_name') to add it to the pipeline.")
 | 
			
		||||
    E967 = ("No {meta} meta information found for '{name}'. This is likely a bug in spaCy.")
 | 
			
		||||
    E968 = ("nlp.replace_pipe now takes the string name of the registered component "
 | 
			
		||||
            "factory, not a callable component. Expected string, but got "
 | 
			
		||||
            "{component}.\n\n- If you created your component with"
 | 
			
		||||
            "with nlp.create_pipe('name'): remove nlp.create_pipe and call "
 | 
			
		||||
            "nlp.replace_pipe('{name}', 'name') instead.\n\n- If you passed in a "
 | 
			
		||||
            "component like TextCategorizer(): call nlp.replace_pipe with the "
 | 
			
		||||
            "string name instead, e.g. nlp.replace_pipe('{name}', 'textcat').\n\n"
 | 
			
		||||
            "- If you're using a custom component: Add the decorator "
 | 
			
		||||
            "@Language.component (for function components) or @Language.factory "
 | 
			
		||||
            "(for class components / factories) to your custom component and "
 | 
			
		||||
            "assign it a name, e.g. @Language.component('your_name'). You can "
 | 
			
		||||
            "then run nlp.replace_pipe('{name}', 'your_name').")
 | 
			
		||||
    E969 = ("Expected string values for field '{field}', but received {types} instead. ")
 | 
			
		||||
    E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
 | 
			
		||||
    E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
 | 
			
		||||
@@ -506,10 +564,12 @@ class Errors:
            "into {values}, but found {value}.")
 | 
			
		||||
    E983 = ("Invalid key for '{dict}': {key}. Available keys: "
 | 
			
		||||
            "{keys}")
 | 
			
		||||
    E985 = ("The pipeline component '{component}' is already available in the base "
 | 
			
		||||
            "model. The settings in the component block in the config file are "
 | 
			
		||||
            "being ignored. If you want to replace this component instead, set "
 | 
			
		||||
            "'replace' to True in the training configuration.")
 | 
			
		||||
    E984 = ("Invalid component config for '{name}': no @factories key "
 | 
			
		||||
            "specifying the registered function used to initialize the "
 | 
			
		||||
            "component. For example, @factories = \"ner\" will use the 'ner' "
 | 
			
		||||
            "factory and all other settings in the block will be passed "
 | 
			
		||||
            "to it as arguments.\n\n{config}")
 | 
			
		||||
    E985 = ("Can't load model from config file: no 'nlp' section found.\n\n{config}")
 | 
			
		||||
    E986 = ("Could not create any training batches: check your input. "
 | 
			
		||||
            "Perhaps discard_oversize should be set to False ?")
 | 
			
		||||
    E987 = ("The text of an example training instance is either a Doc or "
 | 
			
		||||
@@ -530,9 +590,9 @@ class Errors:
    E992 = ("The function `select_pipes` was called with `enable`={enable} "
 | 
			
		||||
            "and `disable`={disable} but that information is conflicting "
 | 
			
		||||
            "for the `nlp` pipeline with components {names}.")
 | 
			
		||||
    E993 = ("The config for 'nlp' should include either a key 'name' to "
 | 
			
		||||
            "refer to an existing model by name or path, or a key 'lang' "
 | 
			
		||||
            "to create a new blank model.")
 | 
			
		||||
    E993 = ("The config for 'nlp' needs to include a key 'lang' specifying "
 | 
			
		||||
            "the code of the language to initialize it with (for example "
 | 
			
		||||
            "'en' for English).\n\n{config}")
 | 
			
		||||
    E996 = ("Could not parse {file}: {msg}")
 | 
			
		||||
    E997 = ("Tokenizer special cases are not allowed to modify the text. "
 | 
			
		||||
            "This would map '{chunk}' to '{orth}' given token attributes "
 | 
			
		||||
@@ -540,9 +600,9 @@ class Errors:
    E999 = ("Unable to merge the `Doc` objects because they do not all share "
 | 
			
		||||
            "the same `Vocab`.")
 | 
			
		||||
    E1000 = ("No pkuseg model available. Provide a pkuseg model when "
 | 
			
		||||
            "initializing the pipeline: "
 | 
			
		||||
            '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
 | 
			
		||||
            'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`')
 | 
			
		||||
             "initializing the pipeline:\n"
 | 
			
		||||
             'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\m'
 | 
			
		||||
             'nlp = Chinese(config=cfg)')
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@add_codes
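Taken together, the new E957/E965/E966/E968 messages describe the v3 registration workflow. A rough sketch of what they point users to follows; every name in it ("clean_text", "fancy_matcher", FancyMatcher) is made up for the example.

```python
# Sketch of the workflow the new error messages describe; names are hypothetical.
import spacy
from spacy.language import Language

@Language.component("clean_text")             # stateless function component
def clean_text(doc):
    return doc                                # must return the processed Doc (see E005)

class FancyMatcher:
    """Minimal made-up stateful component used by the factory below."""
    def __init__(self, nlp, name):
        self.name = name
    def __call__(self, doc):
        return doc

@Language.factory("fancy_matcher")            # factories receive nlp and name (see E964)
def create_fancy_matcher(nlp, name):
    return FancyMatcher(nlp, name)

nlp = spacy.blank("en")
nlp.add_pipe("clean_text")                    # add by registered string name (see E966)
nlp.add_pipe("fancy_matcher", last=True)
```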
@@ -1,10 +1,9 @@
import re
 | 
			
		||||
 | 
			
		||||
from .conll_ner2docs import n_sents_info
 | 
			
		||||
from ...gold import Example
 | 
			
		||||
from ...gold import iob_to_biluo, spans_from_biluo_tags
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...tokens import Doc, Token, Span
 | 
			
		||||
from ...vocab import Vocab
 | 
			
		||||
from wasabi import Printer
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -73,7 +72,7 @@ def read_conllx(
    ner_map=None,
 | 
			
		||||
):
 | 
			
		||||
    """ Yield docs, one for each sentence """
 | 
			
		||||
    vocab = Language.Defaults.create_vocab()  # need vocab to make a minimal Doc
 | 
			
		||||
    vocab = Vocab()  # need vocab to make a minimal Doc
 | 
			
		||||
    for sent in input_data.strip().split("\n\n"):
 | 
			
		||||
        lines = sent.strip().split("\n")
 | 
			
		||||
        if lines:
 | 
			
		||||
@@ -1,17 +1,26 @@
from typing import Set
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class AfrikaansDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "af"
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "af"
 | 
			
		||||
stop_words = {"@language_data": "spacy.af.stop_words"}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.af.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Afrikaans(Language):
 | 
			
		||||
    lang = "af"
 | 
			
		||||
    Defaults = AfrikaansDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Afrikaans"]
@@ -1,31 +1,48 @@
from typing import Set, Dict, Callable, Any
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
from .punctuation import TOKENIZER_SUFFIXES
 | 
			
		||||
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ..norm_exceptions import BASE_NORMS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG, NORM
 | 
			
		||||
from ...util import update_exc, add_lookups
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "ar"
 | 
			
		||||
stop_words = {"@language_data": "spacy.ar.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.ar.lex_attr_getters"}
 | 
			
		||||
 | 
			
		||||
[nlp.writing_system]
 | 
			
		||||
direction = "rtl"
 | 
			
		||||
has_case = false
 | 
			
		||||
has_letters = true
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.ar.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.ar.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ArabicDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "ar"
 | 
			
		||||
    lex_attr_getters[NORM] = add_lookups(
 | 
			
		||||
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
 | 
			
		||||
    )
 | 
			
		||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    suffixes = TOKENIZER_SUFFIXES
 | 
			
		||||
    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Arabic(Language):
 | 
			
		||||
    lang = "ar"
 | 
			
		||||
    Defaults = ArabicDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Arabic"]
 | 
			
		||||
@@ -1,17 +1,26 @@
from typing import Set
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class BulgarianDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "bg"
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "bg"
 | 
			
		||||
stop_words = {"@language_data": "spacy.bg.stop_words"}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.bg.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Bulgarian(Language):
 | 
			
		||||
    lang = "bg"
 | 
			
		||||
    Defaults = BulgarianDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Bulgarian"]
 | 
			
		||||
@@ -1,18 +1,35 @@
from typing import Set
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import update_exc
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "bn"
 | 
			
		||||
stop_words = {"@language_data": "spacy.bn.stop_words"}
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer]
 | 
			
		||||
@lemmatizers = "spacy.Lemmatizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer.data_paths]
 | 
			
		||||
@language_data = "spacy-lookups-data"
 | 
			
		||||
lang = ${nlp:lang}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.bn.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class BengaliDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "bn"
 | 
			
		||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    prefixes = TOKENIZER_PREFIXES
 | 
			
		||||
    suffixes = TOKENIZER_SUFFIXES
 | 
			
		||||
    infixes = TOKENIZER_INFIXES
 | 
			
		||||
@@ -21,6 +38,7 @@ class BengaliDefaults(Language.Defaults):
class Bengali(Language):
 | 
			
		||||
    lang = "bn"
 | 
			
		||||
    Defaults = BengaliDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Bengali"]
 | 
			
		||||
@@ -1,31 +1,49 @@
from typing import Set, Dict, Callable, Any
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ..norm_exceptions import BASE_NORMS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG, NORM
 | 
			
		||||
from ...util import update_exc, add_lookups
 | 
			
		||||
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
from .punctuation import TOKENIZER_INFIXES
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "ca"
 | 
			
		||||
stop_words = {"@language_data": "spacy.ca.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"}
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer]
 | 
			
		||||
@lemmatizers = "spacy.Lemmatizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer.data_paths]
 | 
			
		||||
@language_data = "spacy-lookups-data"
 | 
			
		||||
lang = ${nlp:lang}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.ca.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.ca.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class CatalanDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "ca"
 | 
			
		||||
    lex_attr_getters[NORM] = add_lookups(
 | 
			
		||||
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
 | 
			
		||||
    )
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    infixes = TOKENIZER_INFIXES
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Catalan(Language):
 | 
			
		||||
    lang = "ca"
 | 
			
		||||
    Defaults = CatalanDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Catalan"]
 | 
			
		||||
@@ -1,17 +1,26 @@
from typing import Set
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class CzechDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "cs"
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "cs"
 | 
			
		||||
stop_words = {"@language_data": "spacy.cs.stop_words"}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.cs.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Czech(Language):
 | 
			
		||||
    lang = "cs"
 | 
			
		||||
    Defaults = CzechDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Czech"]
 | 
			
		||||
@@ -1,27 +1,50 @@
from typing import Set, Dict, Callable, Any
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import update_exc
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "da"
 | 
			
		||||
stop_words = {"@language_data": "spacy.da.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"}
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer]
 | 
			
		||||
@lemmatizers = "spacy.Lemmatizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer.data_paths]
 | 
			
		||||
@language_data = "spacy-lookups-data"
 | 
			
		||||
lang = ${nlp:lang}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.da.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.da.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class DanishDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "da"
 | 
			
		||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    infixes = TOKENIZER_INFIXES
 | 
			
		||||
    suffixes = TOKENIZER_SUFFIXES
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Danish(Language):
 | 
			
		||||
    lang = "da"
 | 
			
		||||
    Defaults = DanishDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Danish"]
 | 
			
		||||
@@ -1,23 +1,40 @@
from typing import Set
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 | 
			
		||||
from .punctuation import TOKENIZER_INFIXES
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .syntax_iterators import SYNTAX_ITERATORS
 | 
			
		||||
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import update_exc
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "de"
 | 
			
		||||
stop_words = {"@language_data": "spacy.de.stop_words"}
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer]
 | 
			
		||||
@lemmatizers = "spacy.Lemmatizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer.data_paths]
 | 
			
		||||
@language_data = "spacy-lookups-data"
 | 
			
		||||
lang = ${nlp:lang}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.de.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class GermanDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "de"
 | 
			
		||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    prefixes = TOKENIZER_PREFIXES
 | 
			
		||||
    suffixes = TOKENIZER_SUFFIXES
 | 
			
		||||
    infixes = TOKENIZER_INFIXES
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    syntax_iterators = SYNTAX_ITERATORS
 | 
			
		||||
    single_orth_variants = [
 | 
			
		||||
        {"tags": ["$("], "variants": ["…", "..."]},
 | 
			
		||||
@@ -38,6 +55,7 @@ class GermanDefaults(Language.Defaults):
class German(Language):
 | 
			
		||||
    lang = "de"
 | 
			
		||||
    Defaults = GermanDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["German"]
 | 
			
		||||
@@ -1,3 +1,6 @@
from typing import Set, Dict, Callable, Any
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
@@ -6,32 +9,51 @@ from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...lookups import Lookups
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import update_exc
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "el"
 | 
			
		||||
stop_words = {"@language_data": "spacy.el.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer]
 | 
			
		||||
@lemmatizers = "spacy.GreekLemmatizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer.data_paths]
 | 
			
		||||
@language_data = "spacy-lookups-data"
 | 
			
		||||
lang = ${nlp:lang}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.lemmatizers("spacy.GreekLemmatizer.v1")
 | 
			
		||||
def create_greek_lemmatizer(data_paths: dict = {}) -> GreekLemmatizer:
 | 
			
		||||
    return GreekLemmatizer(data_paths=data_paths)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.el.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.el.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class GreekDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "el"
 | 
			
		||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    prefixes = TOKENIZER_PREFIXES
 | 
			
		||||
    suffixes = TOKENIZER_SUFFIXES
 | 
			
		||||
    infixes = TOKENIZER_INFIXES
 | 
			
		||||
    syntax_iterators = SYNTAX_ITERATORS
 | 
			
		||||
 | 
			
		||||
    @classmethod
 | 
			
		||||
    def create_lemmatizer(cls, nlp=None, lookups=None):
 | 
			
		||||
        if lookups is None:
 | 
			
		||||
            lookups = Lookups()
 | 
			
		||||
        return GreekLemmatizer(lookups)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Greek(Language):
 | 
			
		||||
    lang = "el"
 | 
			
		||||
    Defaults = GreekDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Greek"]
@@ -1,3 +1,5 @@
from typing import Dict, List
 | 
			
		||||
 | 
			
		||||
from ...lemmatizer import Lemmatizer
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -12,7 +14,13 @@ class GreekLemmatizer(Lemmatizer):
    not applicable for Greek language.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    def lemmatize(self, string, index, exceptions, rules):
 | 
			
		||||
    def lemmatize(
 | 
			
		||||
        self,
 | 
			
		||||
        string: str,
 | 
			
		||||
        index: Dict[str, List[str]],
 | 
			
		||||
        exceptions: Dict[str, Dict[str, List[str]]],
 | 
			
		||||
        rules: Dict[str, List[List[str]]],
 | 
			
		||||
    ) -> List[str]:
 | 
			
		||||
        string = string.lower()
 | 
			
		||||
        forms = []
 | 
			
		||||
        if string in index:
 | 
			
		||||
@@ -1,25 +1,50 @@
from typing import Set, Dict, Callable, Any
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
from .syntax_iterators import SYNTAX_ITERATORS
 | 
			
		||||
 | 
			
		||||
from .lemmatizer import is_base_form
 | 
			
		||||
from .punctuation import TOKENIZER_INFIXES
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import update_exc
 | 
			
		||||
from ...lemmatizer import Lemmatizer
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _return_en(_):
 | 
			
		||||
    return "en"
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "en"
 | 
			
		||||
stop_words = {"@language_data": "spacy.en.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer]
 | 
			
		||||
@lemmatizers = "spacy.EnglishLemmatizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer.data_paths]
 | 
			
		||||
@language_data = "spacy-lookups-data"
 | 
			
		||||
lang = ${nlp:lang}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.en.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.en.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.lemmatizers("spacy.EnglishLemmatizer.v1")
 | 
			
		||||
def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
 | 
			
		||||
    return Lemmatizer(data_paths=data_paths, is_base_form=is_base_form)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class EnglishDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[LANG] = _return_en
 | 
			
		||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    syntax_iterators = SYNTAX_ITERATORS
 | 
			
		||||
    infixes = TOKENIZER_INFIXES
 | 
			
		||||
    single_orth_variants = [
 | 
			
		||||
@@ -31,45 +56,11 @@ class EnglishDefaults(Language.Defaults):
        {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
 | 
			
		||||
    ]
 | 
			
		||||
 | 
			
		||||
    @classmethod
 | 
			
		||||
    def is_base_form(cls, univ_pos, morphology=None):
 | 
			
		||||
        """
 | 
			
		||||
        Check whether we're dealing with an uninflected paradigm, so we can
 | 
			
		||||
        avoid lemmatization entirely.
 | 
			
		||||
 | 
			
		||||
        univ_pos (unicode / int): The token's universal part-of-speech tag.
 | 
			
		||||
        morphology (dict): The token's morphological features following the
 | 
			
		||||
            Universal Dependencies scheme.
 | 
			
		||||
        """
 | 
			
		||||
        if morphology is None:
 | 
			
		||||
            morphology = {}
 | 
			
		||||
        if univ_pos == "noun" and morphology.get("Number") == "sing":
 | 
			
		||||
            return True
 | 
			
		||||
        elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
 | 
			
		||||
            return True
 | 
			
		||||
        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
 | 
			
		||||
        # morphology
 | 
			
		||||
        elif univ_pos == "verb" and (
 | 
			
		||||
            morphology.get("VerbForm") == "fin"
 | 
			
		||||
            and morphology.get("Tense") == "pres"
 | 
			
		||||
            and morphology.get("Number") is None
 | 
			
		||||
        ):
 | 
			
		||||
            return True
 | 
			
		||||
        elif univ_pos == "adj" and morphology.get("Degree") == "pos":
 | 
			
		||||
            return True
 | 
			
		||||
        elif morphology.get("VerbForm") == "inf":
 | 
			
		||||
            return True
 | 
			
		||||
        elif morphology.get("VerbForm") == "none":
 | 
			
		||||
            return True
 | 
			
		||||
        elif morphology.get("Degree") == "pos":
 | 
			
		||||
            return True
 | 
			
		||||
        else:
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class English(Language):
 | 
			
		||||
    lang = "en"
 | 
			
		||||
    Defaults = EnglishDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["English"]
 | 
			
		||||
spacy/lang/en/lemmatizer.py (new file, 36 lines)
@@ -0,0 +1,36 @@
from typing import Optional
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
 | 
			
		||||
    """
 | 
			
		||||
    Check whether we're dealing with an uninflected paradigm, so we can
 | 
			
		||||
    avoid lemmatization entirely.
 | 
			
		||||
 | 
			
		||||
    univ_pos (unicode / int): The token's universal part-of-speech tag.
 | 
			
		||||
    morphology (dict): The token's morphological features following the
 | 
			
		||||
        Universal Dependencies scheme.
 | 
			
		||||
    """
 | 
			
		||||
    if morphology is None:
 | 
			
		||||
        morphology = {}
 | 
			
		||||
    if univ_pos == "noun" and morphology.get("Number") == "sing":
 | 
			
		||||
        return True
 | 
			
		||||
    elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
 | 
			
		||||
        return True
 | 
			
		||||
    # This maps 'VBP' to base form -- probably just need 'IS_BASE'
 | 
			
		||||
    # morphology
 | 
			
		||||
    elif univ_pos == "verb" and (
 | 
			
		||||
        morphology.get("VerbForm") == "fin"
 | 
			
		||||
        and morphology.get("Tense") == "pres"
 | 
			
		||||
        and morphology.get("Number") is None
 | 
			
		||||
    ):
 | 
			
		||||
        return True
 | 
			
		||||
    elif univ_pos == "adj" and morphology.get("Degree") == "pos":
 | 
			
		||||
        return True
 | 
			
		||||
    elif morphology.get("VerbForm") == "inf":
 | 
			
		||||
        return True
 | 
			
		||||
    elif morphology.get("VerbForm") == "none":
 | 
			
		||||
        return True
 | 
			
		||||
    elif morphology.get("Degree") == "pos":
 | 
			
		||||
        return True
 | 
			
		||||
    else:
 | 
			
		||||
        return False
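Moving is_base_form out of EnglishDefaults makes it an importable, easily testable helper. A few illustrative checks, derived purely from the branches above:

```python
# Behaviour implied by the branches above (illustrative only).
from spacy.lang.en.lemmatizer import is_base_form

assert is_base_form("noun", {"Number": "sing"})                      # singular noun
assert is_base_form("verb", {"VerbForm": "inf"})                     # infinitive
assert not is_base_form("verb", {"VerbForm": "fin", "Tense": "past"})
assert not is_base_form("adj")                                       # no morphology given
```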
@@ -1,47 +1,17 @@
from ...attrs import LIKE_NUM
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# fmt: off
 | 
			
		||||
_num_words = [
 | 
			
		||||
    "zero",
 | 
			
		||||
    "one",
 | 
			
		||||
    "two",
 | 
			
		||||
    "three",
 | 
			
		||||
    "four",
 | 
			
		||||
    "five",
 | 
			
		||||
    "six",
 | 
			
		||||
    "seven",
 | 
			
		||||
    "eight",
 | 
			
		||||
    "nine",
 | 
			
		||||
    "ten",
 | 
			
		||||
    "eleven",
 | 
			
		||||
    "twelve",
 | 
			
		||||
    "thirteen",
 | 
			
		||||
    "fourteen",
 | 
			
		||||
    "fifteen",
 | 
			
		||||
    "sixteen",
 | 
			
		||||
    "seventeen",
 | 
			
		||||
    "eighteen",
 | 
			
		||||
    "nineteen",
 | 
			
		||||
    "twenty",
 | 
			
		||||
    "thirty",
 | 
			
		||||
    "forty",
 | 
			
		||||
    "fifty",
 | 
			
		||||
    "sixty",
 | 
			
		||||
    "seventy",
 | 
			
		||||
    "eighty",
 | 
			
		||||
    "ninety",
 | 
			
		||||
    "hundred",
 | 
			
		||||
    "thousand",
 | 
			
		||||
    "million",
 | 
			
		||||
    "billion",
 | 
			
		||||
    "trillion",
 | 
			
		||||
    "quadrillion",
 | 
			
		||||
    "gajillion",
 | 
			
		||||
    "bazillion",
 | 
			
		||||
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
 | 
			
		||||
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
 | 
			
		||||
    "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty",
 | 
			
		||||
    "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
 | 
			
		||||
    "million", "billion", "trillion", "quadrillion", "gajillion", "bazillion"
 | 
			
		||||
]
 | 
			
		||||
# fmt: on
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def like_num(text):
 | 
			
		||||
def like_num(text: str) -> bool:
 | 
			
		||||
    if text.startswith(("+", "-", "±", "~")):
 | 
			
		||||
        text = text[1:]
 | 
			
		||||
    text = text.replace(",", "").replace(".", "")
 | 
			
		||||
@@ -1,33 +1,52 @@
from typing import Set, Dict, Callable, Any
 | 
			
		||||
from thinc.config import Config
 | 
			
		||||
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
from .syntax_iterators import SYNTAX_ITERATORS
 | 
			
		||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 | 
			
		||||
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ..norm_exceptions import BASE_NORMS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG, NORM
 | 
			
		||||
from ...util import update_exc, add_lookups
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "es"
 | 
			
		||||
stop_words = {"@language_data": "spacy.es.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer]
 | 
			
		||||
@lemmatizers = "spacy.Lemmatizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer.data_paths]
 | 
			
		||||
@language_data = "spacy-lookups-data"
 | 
			
		||||
lang = ${nlp:lang}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.es.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.es.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class SpanishDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "es"
 | 
			
		||||
    lex_attr_getters[NORM] = add_lookups(
 | 
			
		||||
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
 | 
			
		||||
    )
 | 
			
		||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    infixes = TOKENIZER_INFIXES
 | 
			
		||||
    suffixes = TOKENIZER_SUFFIXES
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    syntax_iterators = SYNTAX_ITERATORS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Spanish(Language):
 | 
			
		||||
    lang = "es"
 | 
			
		||||
    Defaults = SpanishDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Spanish"]
 | 
			
		||||
@@ -1,17 +1,26 @@
from typing import Set
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class EstonianDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "et"
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "et"
 | 
			
		||||
stop_words = {"@language_data": "spacy.et.stop_words"}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.et.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Estonian(Language):
 | 
			
		||||
    lang = "et"
 | 
			
		||||
    Defaults = EstonianDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Estonian"]
 | 
			
		||||
@@ -1,25 +1,41 @@
from typing import Set, Dict, Callable, Any
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
from .punctuation import TOKENIZER_SUFFIXES
 | 
			
		||||
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "eu"
 | 
			
		||||
stop_words = {"@language_data": "spacy.eu.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.eu.lex_attr_getters"}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.eu.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.eu.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class BasqueDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "eu"
 | 
			
		||||
 | 
			
		||||
    tokenizer_exceptions = BASE_EXCEPTIONS
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    suffixes = TOKENIZER_SUFFIXES
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Basque(Language):
 | 
			
		||||
    lang = "eu"
 | 
			
		||||
    Defaults = BasqueDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Basque"]
 | 
			
		||||
@@ -1,7 +1,8 @@
from typing import Set, Dict, Callable, Any
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG, NORM
 | 
			
		||||
from ...util import update_exc, add_lookups
 | 
			
		||||
from ..norm_exceptions import BASE_NORMS
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
@@ -9,23 +10,46 @@ from .punctuation import TOKENIZER_SUFFIXES
from .syntax_iterators import SYNTAX_ITERATORS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "fa"
 | 
			
		||||
stop_words = {"@language_data": "spacy.fa.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"}
 | 
			
		||||
 | 
			
		||||
[nlp.writing_system]
 | 
			
		||||
direction = "rtl"
 | 
			
		||||
has_case = false
 | 
			
		||||
has_letters = true
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer]
 | 
			
		||||
@lemmatizers = "spacy.Lemmatizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer.data_paths]
 | 
			
		||||
@language_data = "spacy-lookups-data"
 | 
			
		||||
lang = ${nlp:lang}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.fa.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.fa.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class PersianDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[NORM] = add_lookups(
 | 
			
		||||
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
 | 
			
		||||
    )
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "fa"
 | 
			
		||||
    tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    suffixes = TOKENIZER_SUFFIXES
 | 
			
		||||
    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
 | 
			
		||||
    syntax_iterators = SYNTAX_ITERATORS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Persian(Language):
 | 
			
		||||
    lang = "fa"
 | 
			
		||||
    Defaults = PersianDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Persian"]
 | 
			
		||||
@@ -1,31 +1,43 @@
from typing import Set, Dict, Callable, Any
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 | 
			
		||||
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ..norm_exceptions import BASE_NORMS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG, NORM
 | 
			
		||||
from ...util import update_exc, add_lookups
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "fi"
 | 
			
		||||
stop_words = {"@language_data": "spacy.fi.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.fi.lex_attr_getters"}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.fi.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.fi.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class FinnishDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "fi"
 | 
			
		||||
    lex_attr_getters[NORM] = add_lookups(
 | 
			
		||||
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
 | 
			
		||||
    )
 | 
			
		||||
    infixes = TOKENIZER_INFIXES
 | 
			
		||||
    suffixes = TOKENIZER_SUFFIXES
 | 
			
		||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Finnish(Language):
 | 
			
		||||
    lang = "fi"
 | 
			
		||||
    Defaults = FinnishDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Finnish"]
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
@@ -1,44 +1,61 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import FrenchLemmatizer
from .lemmatizer import FrenchLemmatizer, is_base_form
from .syntax_iterators import SYNTAX_ITERATORS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "fr"
stop_words = {"@language_data": "spacy.fr.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.FrenchLemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.lemmatizers("spacy.FrenchLemmatizer.v1")
def create_french_lemmatizer(data_paths: dict = {}) -> FrenchLemmatizer:
    return FrenchLemmatizer(data_paths=data_paths, is_base_form=is_base_form)


@registry.language_data("spacy.fr.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.fr.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class FrenchDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "fr"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    token_match = TOKEN_MATCH
    syntax_iterators = SYNTAX_ITERATORS

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        if lookups is None:
            lookups = Lookups()
        return FrenchLemmatizer(lookups)


class French(Language):
    lang = "fr"
    Defaults = FrenchDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["French"]
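As a hedged illustration (the name here is hypothetical, not from the PR), the same @lemmatizers decorator shown above can register an alternative factory, which a config could then reference via @lemmatizers = "my.FrenchLemmatizer.v2":

    from spacy.util import registry
    from spacy.lang.fr.lemmatizer import FrenchLemmatizer, is_base_form

    @registry.lemmatizers("my.FrenchLemmatizer.v2")
    def create_custom_french_lemmatizer(data_paths: dict = {}) -> FrenchLemmatizer:
        # mirrors create_french_lemmatizer above; data_paths normally comes
        # from the [nlp.lemmatizer.data_paths] block of the config
        return FrenchLemmatizer(data_paths=data_paths, is_base_form=is_base_form)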
@@ -1,3 +1,5 @@
from typing import Optional, List, Dict

from ...lemmatizer import Lemmatizer
from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
from ...symbols import SCONJ, CCONJ

@@ -13,7 +15,9 @@ class FrenchLemmatizer(Lemmatizer):
    the lookup table.
    """

    def __call__(self, string, univ_pos, morphology=None):
    def __call__(
        self, string: str, univ_pos: str, morphology: Optional[dict] = None
    ) -> List[str]:
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        if "lemma_rules" not in self.lookups:
            return [lookup_table.get(string, string)]

@@ -52,62 +56,19 @@ class FrenchLemmatizer(Lemmatizer):
        )
        return lemmas

    def is_base_form(self, univ_pos, morphology=None):
        """
        Check whether we're dealing with an uninflected paradigm, so we can
        avoid lemmatization entirely.
        """
        morphology = {} if morphology is None else morphology
        others = [
            key
            for key in morphology
            if key not in (POS, "Number", "POS", "VerbForm", "Tense")
        ]
        if univ_pos == "noun" and morphology.get("Number") == "sing":
            return True
        elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
            return True
        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
        # morphology
        elif univ_pos == "verb" and (
            morphology.get("VerbForm") == "fin"
            and morphology.get("Tense") == "pres"
            and morphology.get("Number") is None
            and not others
        ):
            return True
        elif univ_pos == "adj" and morphology.get("Degree") == "pos":
            return True
        elif "VerbForm=inf" in morphology:
            return True
        elif "VerbForm=none" in morphology:
            return True
        elif "Number=sing" in morphology:
            return True
        elif "Degree=pos" in morphology:
            return True
        else:
            return False

    def noun(self, string, morphology=None):
        return self(string, "noun", morphology)

    def verb(self, string, morphology=None):
        return self(string, "verb", morphology)

    def adj(self, string, morphology=None):
        return self(string, "adj", morphology)

    def punct(self, string, morphology=None):
        return self(string, "punct", morphology)

    def lookup(self, string, orth=None):
    def lookup(self, string: str, orth: Optional[int] = None) -> str:
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        if orth is not None and orth in lookup_table:
            return lookup_table[orth][0]
        return string

    def lemmatize(self, string, index, exceptions, rules):
    def lemmatize(
        self,
        string: str,
        index: Dict[str, List[str]],
        exceptions: Dict[str, Dict[str, List[str]]],
        rules: Dict[str, List[List[str]]],
    ) -> List[str]:
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        string = string.lower()
        forms = []

@@ -133,3 +94,41 @@ class FrenchLemmatizer(Lemmatizer):
        if not forms:
            forms.append(string)
        return list(set(forms))


def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
    """
    Check whether we're dealing with an uninflected paradigm, so we can
    avoid lemmatization entirely.
    """
    morphology = {} if morphology is None else morphology
    others = [
        key
        for key in morphology
        if key not in (POS, "Number", "POS", "VerbForm", "Tense")
    ]
    if univ_pos == "noun" and morphology.get("Number") == "sing":
        return True
    elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
        return True
    # This maps 'VBP' to base form -- probably just need 'IS_BASE'
    # morphology
    elif univ_pos == "verb" and (
        morphology.get("VerbForm") == "fin"
        and morphology.get("Tense") == "pres"
        and morphology.get("Number") is None
        and not others
    ):
        return True
    elif univ_pos == "adj" and morphology.get("Degree") == "pos":
        return True
    elif "VerbForm=inf" in morphology:
        return True
    elif "VerbForm=none" in morphology:
        return True
    elif "Number=sing" in morphology:
        return True
    elif "Degree=pos" in morphology:
        return True
    else:
        return False
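A small usage check of the module-level is_base_form() helper above; the expected results follow directly from the function body:

    from spacy.lang.fr.lemmatizer import is_base_form

    assert is_base_form("noun", {"Number": "sing"}) is True
    assert is_base_form("verb", {"VerbForm": "inf"}) is True
    assert is_base_form("adj", {"Degree": "cmp"}) is False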
@@ -1,23 +1,33 @@
from typing import Set
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "ga"
stop_words = {"@language_data": "spacy.ga.stop_words"}
"""


@registry.language_data("spacy.ga.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class IrishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "ga"

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)


class Irish(Language):
    lang = "ga"
    Defaults = IrishDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Irish"]
@@ -1,15 +1,26 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS

from ...language import Language
from ...util import registry


class GujaratiDefaults(Language.Defaults):
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "gu"
stop_words = {"@language_data": "spacy.gu.stop_words"}
"""


@registry.language_data("spacy.gu.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Gujarati(Language):
    lang = "gu"
    Defaults = GujaratiDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Gujarati"]
@@ -1,22 +1,37 @@
from .stop_words import STOP_WORDS
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "he"
stop_words = {"@language_data": "spacy.he.stop_words"}

[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
"""


@registry.language_data("spacy.he.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class HebrewDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "he"
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
    stop_words = STOP_WORDS
    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}


class Hebrew(Language):
    lang = "he"
    Defaults = HebrewDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Hebrew"]
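A minimal sketch (not from the diff) of reading the [nlp.writing_system] block back out of the config string above, assuming thinc's Config behaves like a nested dict after from_str():

    from thinc.api import Config

    cfg = Config().from_str(DEFAULT_CONFIG)
    assert dict(cfg["nlp"]["writing_system"]) == {
        "direction": "rtl",
        "has_case": False,
        "has_letters": True,
    }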
@@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ...language import Language
from ...attrs import LANG
from ...util import registry


class HindiDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "hi"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "hi"
stop_words = {"@language_data": "spacy.hi.stop_words"}
lex_attr_getters = {"@language_data": "spacy.hi.lex_attr_getters"}
"""


@registry.language_data("spacy.hi.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.hi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class Hindi(Language):
    lang = "hi"
    Defaults = HindiDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Hindi"]
@@ -1,25 +1,39 @@
from .stop_words import STOP_WORDS
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "hr"
stop_words = {"@language_data": "spacy.hr.stop_words"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.hr.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class CroatianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "hr"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
    stop_words = STOP_WORDS


class Croatian(Language):
    lang = "hr"
    Defaults = CroatianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Croatian"]
@@ -1,22 +1,35 @@
from typing import Set
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "hu"
stop_words = {"@language_data": "spacy.hu.stop_words"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.hu.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class HungarianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "hu"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES

@@ -26,6 +39,7 @@ class HungarianDefaults(Language.Defaults):
class Hungarian(Language):
    lang = "hu"
    Defaults = HungarianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Hungarian"]
@@ -1,21 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ...attrs import LANG
from ...language import Language
from ...util import registry


class ArmenianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "hy"
DEFAULT_CONFIG = """
[nlp]
lang = "hy"
stop_words = {"@language_data": "spacy.hy.stop_words"}
lex_attr_getters = {"@language_data": "spacy.hy.lex_attr_getters"}
"""

    lex_attr_getters.update(LEX_ATTRS)
    stop_words = STOP_WORDS

@registry.language_data("spacy.hy.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.hy.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class Armenian(Language):
    lang = "hy"
    Defaults = ArmenianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Armenian"]
@@ -1,21 +1,43 @@
from typing import Set, Dict, Callable, Any
from thinc.config import Config

from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "id"
stop_words = {"@language_data": "spacy.id.stop_words"}
lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.id.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.id.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class IndonesianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "id"
    lex_attr_getters.update(LEX_ATTRS)
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES

@@ -25,6 +47,7 @@ class IndonesianDefaults(Language.Defaults):
class Indonesian(Language):
    lang = "id"
    Defaults = IndonesianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Indonesian"]
@@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry


class IcelandicDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "is"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "is"
stop_words = {"@language_data": "spacy.is.stop_words"}
"""


@registry.language_data("spacy.is.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Icelandic(Language):
    lang = "is"
    Defaults = IcelandicDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Icelandic"]
@@ -1,20 +1,34 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "it"
stop_words = {"@language_data": "spacy.it.stop_words"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.it.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class ItalianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "it"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES

@@ -24,6 +38,7 @@ class ItalianDefaults(Language.Defaults):
class Italian(Language):
    lang = "it"
    Defaults = ItalianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Italian"]
@@ -1,21 +1,187 @@
from typing import Optional, Union, Dict, Any, Set
from pathlib import Path
import srsly
from collections import namedtuple, OrderedDict
from collections import namedtuple
from thinc.api import Config

from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP
from .tag_orth_map import TAG_ORTH_MAP
from .tag_bigram_map import TAG_BIGRAM_MAP
from ...attrs import LANG
from ...compat import copy_reg
from ...errors import Errors
from ...language import Language
from ...symbols import POS
from ...tokens import Doc
from ...util import DummyTokenizer
from ...util import DummyTokenizer, registry
from ... import util


DEFAULT_CONFIG = """
[nlp]
lang = "ja"
stop_words = {"@language_data": "spacy.ja.stop_words"}

[nlp.tokenizer]
@tokenizers = "spacy.JapaneseTokenizer.v1"
split_mode = null

[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
"""


@registry.language_data("spacy.ja.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.tokenizers("spacy.JapaneseTokenizer.v1")
def create_japanese_tokenizer(split_mode: Optional[str] = None):
    def japanese_tokenizer_factory(nlp):
        return JapaneseTokenizer(nlp, split_mode=split_mode)

    return japanese_tokenizer_factory


class JapaneseTokenizer(DummyTokenizer):
    def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
        self.vocab = nlp.vocab
        self.split_mode = split_mode
        self.tokenizer = try_sudachi_import(self.split_mode)

    def __call__(self, text: str) -> Doc:
        # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
        sudachipy_tokens = self.tokenizer.tokenize(text)
        dtokens = self._get_dtokens(sudachipy_tokens)
        dtokens, spaces = get_dtokens_and_spaces(dtokens, text)

        # create Doc with tag bi-gram based part-of-speech identification rules
        words, tags, inflections, lemmas, readings, sub_tokens_list = (
            zip(*dtokens) if dtokens else [[]] * 6
        )
        sub_tokens_list = list(sub_tokens_list)
        doc = Doc(self.vocab, words=words, spaces=spaces)
        next_pos = None  # for bi-gram rules
        for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
            token.tag_ = dtoken.tag
            if next_pos:  # already identified in previous iteration
                token.pos = next_pos
                next_pos = None
            else:
                token.pos, next_pos = resolve_pos(
                    token.orth_,
                    dtoken.tag,
                    tags[idx + 1] if idx + 1 < len(tags) else None,
                )
            # if there's no lemma info (it's an unk) just use the surface
            token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
        doc.user_data["inflections"] = inflections
        doc.user_data["reading_forms"] = readings
        doc.user_data["sub_tokens"] = sub_tokens_list
        return doc

    def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True):
        sub_tokens_list = (
            self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
        )
        dtokens = [
            DetailedToken(
                token.surface(),  # orth
                "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]),  # tag
                ",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]),  # inf
                token.dictionary_form(),  # lemma
                token.reading_form(),  # user_data['reading_forms']
                sub_tokens_list[idx]
                if sub_tokens_list
                else None,  # user_data['sub_tokens']
            )
            for idx, token in enumerate(sudachipy_tokens)
            if len(token.surface()) > 0
            # remove empty tokens which can be produced with characters like … that
        ]
        # Sudachi normalizes internally and outputs each space char as a token.
        # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
        return [
            t
            for idx, t in enumerate(dtokens)
            if idx == 0
            or not t.surface.isspace()
            or t.tag != "空白"
            or not dtokens[idx - 1].surface.isspace()
            or dtokens[idx - 1].tag != "空白"
        ]

    def _get_sub_tokens(self, sudachipy_tokens):
        if (
            self.split_mode is None or self.split_mode == "A"
        ):  # do nothing for default split mode
            return None

        sub_tokens_list = []  # list of (list of list of DetailedToken | None)
        for token in sudachipy_tokens:
            sub_a = token.split(self.tokenizer.SplitMode.A)
            if len(sub_a) == 1:  # no sub tokens
                sub_tokens_list.append(None)
            elif self.split_mode == "B":
                sub_tokens_list.append([self._get_dtokens(sub_a, False)])
            else:  # "C"
                sub_b = token.split(self.tokenizer.SplitMode.B)
                if len(sub_a) == len(sub_b):
                    dtokens = self._get_dtokens(sub_a, False)
                    sub_tokens_list.append([dtokens, dtokens])
                else:
                    sub_tokens_list.append(
                        [
                            self._get_dtokens(sub_a, False),
                            self._get_dtokens(sub_b, False),
                        ]
                    )
        return sub_tokens_list

    def _get_config(self) -> Dict[str, Any]:
        return {"split_mode": self.split_mode}

    def _set_config(self, config: Dict[str, Any] = {}) -> None:
        self.split_mode = config.get("split_mode", None)

    def to_bytes(self, **kwargs) -> bytes:
        serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())}
        return util.to_bytes(serializers, [])

    def from_bytes(self, data: bytes, **kwargs) -> "JapaneseTokenizer":
        deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))}
        util.from_bytes(data, deserializers, [])
        self.tokenizer = try_sudachi_import(self.split_mode)
        return self

    def to_disk(self, path: Union[str, Path], **kwargs) -> None:
        path = util.ensure_path(path)
        serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
        return util.to_disk(path, serializers, [])

    def from_disk(self, path: Union[str, Path], **kwargs) -> "JapaneseTokenizer":
        path = util.ensure_path(path)
        serializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))}
        util.from_disk(path, serializers, [])
        self.tokenizer = try_sudachi_import(self.split_mode)
        return self


class JapaneseDefaults(Language.Defaults):
    tag_map = TAG_MAP
    syntax_iterators = SYNTAX_ITERATORS


class Japanese(Language):
    lang = "ja"
    Defaults = JapaneseDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


# Hold the attributes we need with convenient names
DetailedToken = namedtuple(
    "DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]

@@ -133,161 +299,6 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
    return text_dtokens, text_spaces


class JapaneseTokenizer(DummyTokenizer):
    def __init__(self, cls, nlp=None, config={}):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        self.split_mode = config.get("split_mode", None)
        self.tokenizer = try_sudachi_import(self.split_mode)

    def __call__(self, text):
        # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
        sudachipy_tokens = self.tokenizer.tokenize(text)
        dtokens = self._get_dtokens(sudachipy_tokens)
        dtokens, spaces = get_dtokens_and_spaces(dtokens, text)

        # create Doc with tag bi-gram based part-of-speech identification rules
        words, tags, inflections, lemmas, readings, sub_tokens_list = (
            zip(*dtokens) if dtokens else [[]] * 6
        )
        sub_tokens_list = list(sub_tokens_list)
        doc = Doc(self.vocab, words=words, spaces=spaces)
        next_pos = None  # for bi-gram rules
        for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
            token.tag_ = dtoken.tag
            if next_pos:  # already identified in previous iteration
                token.pos = next_pos
                next_pos = None
            else:
                token.pos, next_pos = resolve_pos(
                    token.orth_,
                    dtoken.tag,
                    tags[idx + 1] if idx + 1 < len(tags) else None,
                )
            # if there's no lemma info (it's an unk) just use the surface
            token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface

        doc.user_data["inflections"] = inflections
        doc.user_data["reading_forms"] = readings
        doc.user_data["sub_tokens"] = sub_tokens_list

        return doc

    def _get_dtokens(self, sudachipy_tokens, need_sub_tokens=True):
        sub_tokens_list = (
            self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
        )
        dtokens = [
            DetailedToken(
                token.surface(),  # orth
                "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]),  # tag
                ",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]),  # inf
                token.dictionary_form(),  # lemma
                token.reading_form(),  # user_data['reading_forms']
                sub_tokens_list[idx]
                if sub_tokens_list
                else None,  # user_data['sub_tokens']
            )
            for idx, token in enumerate(sudachipy_tokens)
            if len(token.surface()) > 0
            # remove empty tokens which can be produced with characters like … that
        ]
        # Sudachi normalizes internally and outputs each space char as a token.
        # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
        return [
            t
            for idx, t in enumerate(dtokens)
            if idx == 0
            or not t.surface.isspace()
            or t.tag != "空白"
            or not dtokens[idx - 1].surface.isspace()
            or dtokens[idx - 1].tag != "空白"
        ]

    def _get_sub_tokens(self, sudachipy_tokens):
        if (
            self.split_mode is None or self.split_mode == "A"
        ):  # do nothing for default split mode
            return None

        sub_tokens_list = []  # list of (list of list of DetailedToken | None)
        for token in sudachipy_tokens:
            sub_a = token.split(self.tokenizer.SplitMode.A)
            if len(sub_a) == 1:  # no sub tokens
                sub_tokens_list.append(None)
            elif self.split_mode == "B":
                sub_tokens_list.append([self._get_dtokens(sub_a, False)])
            else:  # "C"
                sub_b = token.split(self.tokenizer.SplitMode.B)
                if len(sub_a) == len(sub_b):
                    dtokens = self._get_dtokens(sub_a, False)
                    sub_tokens_list.append([dtokens, dtokens])
                else:
                    sub_tokens_list.append(
                        [
                            self._get_dtokens(sub_a, False),
                            self._get_dtokens(sub_b, False),
                        ]
                    )
        return sub_tokens_list

    def _get_config(self):
        config = OrderedDict((("split_mode", self.split_mode),))
        return config

    def _set_config(self, config={}):
        self.split_mode = config.get("split_mode", None)

    def to_bytes(self, **kwargs):
        serializers = OrderedDict(
            (("cfg", lambda: srsly.json_dumps(self._get_config())),)
        )
        return util.to_bytes(serializers, [])

    def from_bytes(self, data, **kwargs):
        deserializers = OrderedDict(
            (("cfg", lambda b: self._set_config(srsly.json_loads(b))),)
        )
        util.from_bytes(data, deserializers, [])
        self.tokenizer = try_sudachi_import(self.split_mode)
        return self

    def to_disk(self, path, **kwargs):
        path = util.ensure_path(path)
        serializers = OrderedDict(
            (("cfg", lambda p: srsly.write_json(p, self._get_config())),)
        )
        return util.to_disk(path, serializers, [])

    def from_disk(self, path, **kwargs):
        path = util.ensure_path(path)
        serializers = OrderedDict(
            (("cfg", lambda p: self._set_config(srsly.read_json(p))),)
        )
        util.from_disk(path, serializers, [])
        self.tokenizer = try_sudachi_import(self.split_mode)


class JapaneseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: "ja"
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    syntax_iterators = SYNTAX_ITERATORS
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

    @classmethod
    def create_tokenizer(cls, nlp=None, config={}):
        return JapaneseTokenizer(cls, nlp, config)


class Japanese(Language):
    lang = "ja"
    Defaults = JapaneseDefaults

    def make_doc(self, text):
        return self.tokenizer(text)


def pickle_japanese(instance):
    return Japanese, tuple()

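A standalone illustration (the token values here are made up) of the transposition idiom used in JapaneseTokenizer.__call__, where a list of DetailedToken tuples is unpacked into parallel sequences and [[]] * 6 serves as the fallback for empty input:

    from collections import namedtuple

    DetailedToken = namedtuple(
        "DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]
    )
    dtokens = [
        DetailedToken("東京", "名詞-固有名詞", "", "東京", "トウキョウ", None),
        DetailedToken("です", "助動詞", "", "です", "デス", None),
    ]
    words, tags, inflections, lemmas, readings, sub_tokens_list = (
        zip(*dtokens) if dtokens else [[]] * 6
    )
    print(words)  # ('東京', 'です')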
@@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry


class KannadaDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "kn"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "kn"
stop_words = {"@language_data": "spacy.kn.stop_words"}
"""


@registry.language_data("spacy.kn.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Kannada(Language):
    lang = "kn"
    Defaults = KannadaDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Kannada"]
@@ -1,51 +1,52 @@
from typing import Set, Optional, Any, Dict
from thinc.api import Config

from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP
from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
from ...compat import copy_reg
from ...util import DummyTokenizer
from ...util import DummyTokenizer, registry


def try_mecab_import():
    try:
        from natto import MeCab
DEFAULT_CONFIG = """
[nlp]
lang = "ko"
stop_words = {"@language_data": "spacy.ko.stop_words"}

        return MeCab
    except ImportError:
        raise ImportError(
            "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
            "and [natto-py](https://github.com/buruzaemon/natto-py)"
        )
[nlp.tokenizer]
@tokenizers = "spacy.KoreanTokenizer.v1"

[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
"""


# fmt: on
@registry.language_data("spacy.ko.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


def check_spaces(text, tokens):
    prev_end = -1
    start = 0
    for token in tokens:
        idx = text.find(token, start)
        if prev_end > 0:
            yield prev_end != idx
        prev_end = idx + len(token)
        start = prev_end
    if start > 0:
        yield False
@registry.tokenizers("spacy.KoreanTokenizer.v1")
def create_korean_tokenizer():
    def korean_tokenizer_factory(nlp):
        return KoreanTokenizer(nlp)

    return korean_tokenizer_factory


class KoreanTokenizer(DummyTokenizer):
    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
    def __init__(self, nlp: Optional[Language] = None):
        self.vocab = nlp.vocab
        MeCab = try_mecab_import()
        self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")

    def __del__(self):
        self.mecab_tokenizer.__del__()

    def __call__(self, text):
    def __call__(self, text: str) -> Doc:
        dtokens = list(self.detailed_tokens(text))
        surfaces = [dt["surface"] for dt in dtokens]
        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))

@@ -56,7 +57,7 @@ class KoreanTokenizer(DummyTokenizer):
        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
        return doc

    def detailed_tokens(self, text):
    def detailed_tokens(self, text: str) -> Dict[str, Any]:
        # POS tag[0], semantic class[1], jongseong presence[2], reading[3],
        # type[4], start POS[5], end POS[6], expression[7], *
        for node in self.mecab_tokenizer.parse(text, as_nodes=True):

@@ -72,23 +73,39 @@ class KoreanTokenizer(DummyTokenizer):


class KoreanDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: "ko"
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

    @classmethod
    def create_tokenizer(cls, nlp=None):
        return KoreanTokenizer(cls, nlp)


class Korean(Language):
    lang = "ko"
    Defaults = KoreanDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)

    def make_doc(self, text):
        return self.tokenizer(text)


def try_mecab_import() -> None:
    try:
        from natto import MeCab

        return MeCab
    except ImportError:
        raise ImportError(
            "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
            "and [natto-py](https://github.com/buruzaemon/natto-py)"
        )


def check_spaces(text, tokens):
    prev_end = -1
    start = 0
    for token in tokens:
        idx = text.find(token, start)
        if prev_end > 0:
            yield prev_end != idx
        prev_end = idx + len(token)
        start = prev_end
    if start > 0:
        yield False


def pickle_korean(instance):
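A quick check (not part of the diff) of the check_spaces() helper defined above: it yields one boolean per token, indicating whether that token is followed by whitespace in the original text, with False for the final token:

    from spacy.lang.ko import check_spaces

    text = "안녕 spaCy"
    tokens = ["안녕", "spaCy"]
    print(list(check_spaces(text, tokens)))  # [True, False]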
@@ -1,26 +1,49 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "lb"
stop_words = {"@language_data": "spacy.lb.stop_words"}
lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.lb.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.lb.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class LuxembourgishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "lb"
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    infixes = TOKENIZER_INFIXES


class Luxembourgish(Language):
    lang = "lb"
    Defaults = LuxembourgishDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Luxembourgish"]
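
The {"@language_data": ...} references in DEFAULT_CONFIG point at the functions registered just above. A hedged sketch of how such a reference is expected to resolve, assuming spaCy's catalogue-style registry exposes get(); the config system performs this lookup automatically when the nlp object is built:

    from spacy.util import registry

    # Look up the function registered under "spacy.lb.stop_words" and call it
    # to obtain the STOP_WORDS set.
    stop_words_func = registry.language_data.get("spacy.lb.stop_words")
    print(len(stop_words_func()))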
@@ -1,3 +1,4 @@
from typing import Set
import unicodedata
import re

@@ -21,21 +22,21 @@ _tlds = set(
)


def is_punct(text):
def is_punct(text: str) -> bool:
    for char in text:
        if not unicodedata.category(char).startswith("P"):
            return False
    return True


def is_ascii(text):
def is_ascii(text: str) -> bool:
    for char in text:
        if ord(char) >= 128:
            return False
    return True


def like_num(text):
def like_num(text: str) -> bool:
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # can be overwritten by lang with list of number words
@@ -49,64 +50,31 @@ def like_num(text):
    return False


def is_bracket(text):
def is_bracket(text: str) -> bool:
    brackets = ("(", ")", "[", "]", "{", "}", "<", ">")
    return text in brackets


def is_quote(text):
    quotes = (
        '"',
        "'",
        "`",
        "«",
        "»",
        "‘",
        "’",
        "‚",
        "‛",
        "“",
        "”",
        "„",
        "‟",
        "‹",
        "›",
        "❮",
        "❯",
        "''",
        "``",
    )
def is_quote(text: str) -> bool:
    # fmt: off
    quotes = ('"', "'", "`", "«", "»", "‘", "’", "‚", "‛", "“", "”", "„", "‟", "‹", "›", "❮", "❯", "''", "``")
    # fmt: on
    return text in quotes


def is_left_punct(text):
    left_punct = (
        "(",
        "[",
        "{",
        "<",
        '"',
        "'",
        "«",
        "‘",
        "‚",
        "‛",
        "“",
        "„",
        "‟",
        "‹",
        "❮",
        "``",
    )
def is_left_punct(text: str) -> bool:
    # fmt: off
    left_punct = ("(", "[", "{", "<", '"', "'", "«", "‘", "‚", "‛", "“", "„", "‟", "‹", "❮", "``")
    # fmt: on
    return text in left_punct


def is_right_punct(text):
def is_right_punct(text: str) -> bool:
    right_punct = (")", "]", "}", ">", '"', "'", "»", "’", "”", "›", "❯", "''")
    return text in right_punct


def is_currency(text):
def is_currency(text: str) -> bool:
    # can be overwritten by lang with list of currency words, e.g. dollar, euro
    for char in text:
        if unicodedata.category(char) != "Sc":
@@ -114,11 +82,11 @@ def is_currency(text):
    return True


def like_email(text):
def like_email(text: str) -> bool:
    return bool(_like_email(text))


def like_url(text):
def like_url(text: str) -> bool:
    # We're looking for things that function in text like URLs. So, valid URL
    # or not, anything they say http:// is going to be good.
    if text.startswith("http://") or text.startswith("https://"):
@@ -144,7 +112,7 @@ def like_url(text):
    return False


def word_shape(text):
def word_shape(text: str) -> str:
    if len(text) >= 100:
        return "LONG"
    shape = []
@@ -171,46 +139,52 @@ def word_shape(text):
    return "".join(shape)


def lower(string):
def lower(string: str) -> str:
    return string.lower()


def prefix(string):
def prefix(string: str) -> str:
    return string[0]


def suffix(string):
def suffix(string: str) -> str:
    return string[-3:]


def is_alpha(string):
def is_alpha(string: str) -> bool:
    return string.isalpha()


def is_digit(string):
def is_digit(string: str) -> bool:
    return string.isdigit()


def is_lower(string):
def is_lower(string: str) -> bool:
    return string.islower()


def is_space(string):
def is_space(string: str) -> bool:
    return string.isspace()


def is_title(string):
def is_title(string: str) -> bool:
    return string.istitle()


def is_upper(string):
def is_upper(string: str) -> bool:
    return string.isupper()


def is_stop(string, stops=set()):
def is_stop(string: str, stops: Set[str] = set()) -> bool:
    return string.lower() in stops


def get_lang(text: str, lang: str = "") -> str:
    # This function is partially applied so lang code can be passed in
    # automatically while still allowing pickling
    return lang
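
A quick, informal check of a few of the lexical attribute getters defined above (module path assumed to be spacy.lang.lex_attrs):

    from spacy.lang.lex_attrs import is_bracket, is_quote, prefix, suffix, is_stop

    print(is_bracket("("))                   # True
    print(is_quote("«"))                     # True
    print(prefix("spaCy"), suffix("spaCy"))  # s aCy
    print(is_stop("The", {"the"}))           # True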


LEX_ATTRS = {
    attrs.LOWER: lower,
    attrs.NORM: lower,
@@ -1,28 +1,35 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "lij"
stop_words = {"@language_data": "spacy.lij.stop_words"}
"""


@registry.language_data("spacy.lij.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class LigurianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "lij"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    infixes = TOKENIZER_INFIXES


class Ligurian(Language):
    lang = "lij"
    Defaults = LigurianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Ligurian"]
@@ -1,27 +1,41 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


def _return_lt(_):
    return "lt"
DEFAULT_CONFIG = """
[nlp]
lang = "lt"
stop_words = {"@language_data": "spacy.lt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.lt.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.lt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class LithuanianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = _return_lt
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    lex_attr_getters.update(LEX_ATTRS)

    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    mod_base_exceptions = {
@@ -29,12 +43,12 @@ class LithuanianDefaults(Language.Defaults):
    }
    del mod_base_exceptions["8)"]
    tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS


class Lithuanian(Language):
    lang = "lt"
    Defaults = LithuanianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Lithuanian"]
@@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry


class LatvianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "lv"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "lv"
stop_words = {"@language_data": "spacy.lv.stop_words"}
"""


@registry.language_data("spacy.lv.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Latvian(Language):
    lang = "lv"
    Defaults = LatvianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Latvian"]
@@ -1,15 +1,26 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS

from ...language import Language
from ...util import registry


class MalayalamDefaults(Language.Defaults):
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "ml"
stop_words = {"@language_data": "spacy.ml.stop_words"}
"""


@registry.language_data("spacy.ml.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Malayalam(Language):
    lang = "ml"
    Defaults = MalayalamDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Malayalam"]
@@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry


class MarathiDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "mr"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "mr"
stop_words = {"@language_data": "spacy.mr.stop_words"}
"""


@registry.language_data("spacy.mr.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Marathi(Language):
    lang = "mr"
    Defaults = MarathiDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Marathi"]
@@ -1,33 +1,47 @@
from typing import Set
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "nb"
stop_words = {"@language_data": "spacy.nb.stop_words"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.nb.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class NorwegianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "nb"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    stop_words = STOP_WORDS
    syntax_iterators = SYNTAX_ITERATORS


class Norwegian(Language):
    lang = "nb"
    Defaults = NorwegianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Norwegian"]
@@ -1,23 +1,33 @@
# coding: utf8
from __future__ import unicode_literals
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ...language import Language
from ...attrs import LANG
from ...util import registry


class NepaliDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "ne"  # Nepali language ISO code
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "ne"
stop_words = {"@language_data": "spacy.ne.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ne.lex_attr_getters"}
"""


@registry.language_data("spacy.ne.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.ne.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class Nepali(Language):
    lang = "ne"
    Defaults = NepaliDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Nepali"]
@@ -1,3 +1,6 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -5,36 +8,51 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .lemmatizer import DutchLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "nl"
stop_words = {"@language_data": "spacy.nl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.DutchLemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.nl.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.nl.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


@registry.lemmatizers("spacy.DutchLemmatizer.v1")
def create_dutch_lemmatizer(data_paths: dict = {}) -> DutchLemmatizer:
    return DutchLemmatizer(data_paths=data_paths)


class DutchDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "nl"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        if lookups is None:
            lookups = Lookups()
        return DutchLemmatizer(lookups)


class Dutch(Language):
    lang = "nl"
    Defaults = DutchDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Dutch"]
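
The [nlp.lemmatizer] block in DEFAULT_CONFIG names the factory registered above via @registry.lemmatizers. A hedged sketch of the lookup the config system is expected to perform (assuming catalogue-style registries with get(); the data_paths argument would normally be filled in from the resolved [nlp.lemmatizer.data_paths] section rather than left empty):

    from spacy.util import registry

    factory = registry.lemmatizers.get("spacy.DutchLemmatizer.v1")
    lemmatizer = factory(data_paths={})  # empty dict only for illustration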
@@ -1,3 +1,5 @@
from typing import Optional, List, Dict, Tuple

from ...lemmatizer import Lemmatizer
from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
@@ -34,7 +36,9 @@ class DutchLemmatizer(Lemmatizer):
        "num": "num",
    }

    def __call__(self, string, univ_pos, morphology=None):
    def __call__(
        self, string: str, univ_pos: str, morphology: Optional[dict] = None
    ) -> List[str]:
        # Difference 1: self.rules is assumed to be non-None, so no
        # 'is None' check required.
        # String lowercased from the get-go. All lemmatization results in
@@ -92,7 +96,7 @@ class DutchLemmatizer(Lemmatizer):
    # Overrides parent method so that a lowercased version of the string is
    # used to search the lookup table. This is necessary because our lookup
    # table consists entirely of lowercase keys.
    def lookup(self, string, orth=None):
    def lookup(self, string: str, orth: Optional[int] = None) -> str:
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        string = string.lower()
        if orth is not None:
@@ -102,7 +106,13 @@ class DutchLemmatizer(Lemmatizer):

    # Reimplemented to focus more on application of suffix rules and to return
    # as early as possible.
    def lemmatize(self, string, index, exceptions, rules):
    def lemmatize(
        self,
        string: str,
        index: Dict[str, List[str]],
        exceptions: Dict[str, Dict[str, List[str]]],
        rules: Dict[str, List[List[str]]],
    ) -> Tuple[List[str], bool]:
        # returns (forms, is_known: bool)
        oov_forms = []
        for old, new in rules:
@@ -1,43 +1,60 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import PolishLemmatizer

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import add_lookups
from ...lookups import Lookups
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "pl"
stop_words = {"@language_data": "spacy.pl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.PolishLemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.pl.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.pl.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


@registry.lemmatizers("spacy.PolishLemmatizer.v1")
def create_polish_lemmatizer(data_paths: dict = {}) -> PolishLemmatizer:
    return PolishLemmatizer(data_paths=data_paths)


class PolishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "pl"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    mod_base_exceptions = {
        exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
    }
    tokenizer_exceptions = mod_base_exceptions
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        if lookups is None:
            lookups = Lookups()
        return PolishLemmatizer(lookups)


class Polish(Language):
    lang = "pl"
    Defaults = PolishDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Polish"]
@@ -1,3 +1,5 @@
from typing import Optional, List, Dict

from ...lemmatizer import Lemmatizer
from ...parts_of_speech import NAMES
@@ -7,30 +9,29 @@ class PolishLemmatizer(Lemmatizer):
    # dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS.
    # It utilizes some prefix based improvements for verb and adjectives
    # lemmatization, as well as case-sensitive lemmatization for nouns.
    def __call__(self, string, univ_pos, morphology=None):
    def __call__(
        self, string: str, univ_pos: str, morphology: Optional[dict] = None
    ) -> List[str]:
        if isinstance(univ_pos, int):
            univ_pos = NAMES.get(univ_pos, "X")
        univ_pos = univ_pos.upper()

        lookup_pos = univ_pos.lower()
        if univ_pos == "PROPN":
            lookup_pos = "noun"
        lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})

        if univ_pos == "NOUN":
            return self.lemmatize_noun(string, morphology, lookup_table)

        if univ_pos != "PROPN":
            string = string.lower()

        if univ_pos == "ADJ":
            return self.lemmatize_adj(string, morphology, lookup_table)
        elif univ_pos == "VERB":
            return self.lemmatize_verb(string, morphology, lookup_table)

        return [lookup_table.get(string, string.lower())]

    def lemmatize_adj(self, string, morphology, lookup_table):
    def lemmatize_adj(
        self, string: str, morphology: dict, lookup_table: Dict[str, str]
    ) -> List[str]:
        # this method utilizes different procedures for adjectives
        # with 'nie' and 'naj' prefixes
        if string[:3] == "nie":
@@ -41,25 +42,26 @@ class PolishLemmatizer(Lemmatizer):
                    return [lookup_table[naj_search_string]]
            if search_string in lookup_table:
                return [lookup_table[search_string]]

        if string[:3] == "naj":
            naj_search_string = string[3:]
            if naj_search_string in lookup_table:
                return [lookup_table[naj_search_string]]

        return [lookup_table.get(string, string)]

    def lemmatize_verb(self, string, morphology, lookup_table):
    def lemmatize_verb(
        self, string: str, morphology: dict, lookup_table: Dict[str, str]
    ) -> List[str]:
        # this method utilizes a different procedure for verbs
        # with 'nie' prefix
        if string[:3] == "nie":
            search_string = string[3:]
            if search_string in lookup_table:
                return [lookup_table[search_string]]

        return [lookup_table.get(string, string)]

    def lemmatize_noun(self, string, morphology, lookup_table):
    def lemmatize_noun(
        self, string: str, morphology: dict, lookup_table: Dict[str, str]
    ) -> List[str]:
        # this method is case-sensitive, in order to work
        # for incorrectly tagged proper names
        if string != string.lower():
@@ -68,11 +70,16 @@ class PolishLemmatizer(Lemmatizer):
            elif string in lookup_table:
                return [lookup_table[string]]
            return [string.lower()]

        return [lookup_table.get(string, string)]

    def lookup(self, string, orth=None):
    def lookup(self, string: str, orth: Optional[int] = None) -> str:
        return string.lower()

    def lemmatize(self, string, index, exceptions, rules):
    def lemmatize(
        self,
        string: str,
        index: Dict[str, List[str]],
        exceptions: Dict[str, Dict[str, List[str]]],
        rules: Dict[str, List[List[str]]],
    ) -> List[str]:
        raise NotImplementedError
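
A toy illustration of the "nie" prefix handling in lemmatize_verb above; the lookup table here is a stand-in for the real lemma_lookup_verb table loaded from spacy-lookups-data, and passing an empty Lookups() to the constructor is assumed to be acceptable for this sketch:

    from spacy.lang.pl.lemmatizer import PolishLemmatizer
    from spacy.lookups import Lookups

    lemmatizer = PolishLemmatizer(Lookups())
    toy_table = {"lubic": "lubic"}  # placeholder entry, not real lemma data
    # the "nie" prefix is stripped and the remainder is looked up in the table
    print(lemmatizer.lemmatize_verb("nielubic", {}, toy_table))  # -> ["lubic"]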
@@ -1,20 +1,42 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "pt"
stop_words = {"@language_data": "spacy.pt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.pt.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.pt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class PortugueseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "pt"
    lex_attr_getters.update(LEX_ATTRS)
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    infixes = TOKENIZER_INFIXES
    prefixes = TOKENIZER_PREFIXES
@@ -22,6 +44,7 @@ class PortugueseDefaults(Language.Defaults):
class Portuguese(Language):
    lang = "pt"
    Defaults = PortugueseDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Portuguese"]
@@ -1,27 +1,40 @@
from typing import Set
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry

# Lemma data note:
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
# Replaced characters using cedillas with the correct ones (ș and ț)


DEFAULT_CONFIG = """
[nlp]
lang = "ro"
stop_words = {"@language_data": "spacy.ro.stop_words"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.ro.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class RomanianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "ro"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
@@ -30,6 +43,7 @@ class RomanianDefaults(Language.Defaults):
class Romanian(Language):
    lang = "ro"
    Defaults = RomanianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Romanian"]
@@ -1,32 +1,49 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...util import update_exc
from ...util import update_exc, registry
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG


DEFAULT_CONFIG = """
[nlp]
lang = "ru"
stop_words = {"@language_data": "spacy.ru.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.RussianLemmatizer.v1"
"""


@registry.language_data("spacy.ru.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.ru.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


@registry.lemmatizers("spacy.RussianLemmatizer.v1")
def create_russian_lemmatizer() -> RussianLemmatizer:
    return RussianLemmatizer()


class RussianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "ru"
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        if lookups is None:
            lookups = Lookups()
        return RussianLemmatizer(lookups)


class Russian(Language):
    lang = "ru"
    Defaults = RussianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Russian"]
@@ -1,11 +1,17 @@
from typing import Optional, Tuple, Dict, List

from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
from ...lemmatizer import Lemmatizer
from ...lookups import Lookups


PUNCT_RULES = {"«": '"', "»": '"'}


class RussianLemmatizer(Lemmatizer):
    _morph = None

    def __init__(self, lookups=None):
    def __init__(self, lookups: Optional[Lookups] = None) -> None:
        super(RussianLemmatizer, self).__init__(lookups)
        try:
            from pymorphy2 import MorphAnalyzer
@@ -19,15 +25,15 @@ class RussianLemmatizer(Lemmatizer):
        if RussianLemmatizer._morph is None:
            RussianLemmatizer._morph = MorphAnalyzer()

    def __call__(self, string, univ_pos, morphology=None):
    def __call__(
        self, string: str, univ_pos: str, morphology: Optional[dict] = None
    ) -> List[str]:
        univ_pos = self.normalize_univ_pos(univ_pos)
        if univ_pos == "PUNCT":
            return [PUNCT_RULES.get(string, string)]

        if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
            # Skip unchangeable pos
            return [string.lower()]

        analyses = self._morph.parse(string)
        filtered_analyses = []
        for analysis in analyses:
@@ -39,12 +45,10 @@ class RussianLemmatizer(Lemmatizer):
                analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
            ):
                filtered_analyses.append(analysis)

        if not len(filtered_analyses):
            return [string.lower()]
        if morphology is None or (len(morphology) == 1 and POS in morphology):
            return list(set([analysis.normal_form for analysis in filtered_analyses]))

        if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
            features_to_compare = ["Case", "Number", "Gender"]
        elif univ_pos == "NUM":
@@ -61,7 +65,6 @@ class RussianLemmatizer(Lemmatizer):
                "VerbForm",
                "Voice",
            ]

        analyses, filtered_analyses = filtered_analyses, []
        for analysis in analyses:
            _, analysis_morph = oc2ud(str(analysis.tag))
@@ -74,16 +77,14 @@ class RussianLemmatizer(Lemmatizer):
                    break
            else:
                filtered_analyses.append(analysis)

        if not len(filtered_analyses):
            return [string.lower()]
        return list(set([analysis.normal_form for analysis in filtered_analyses]))

    @staticmethod
    def normalize_univ_pos(univ_pos):
    def normalize_univ_pos(univ_pos: str) -> Optional[str]:
        if isinstance(univ_pos, str):
            return univ_pos.upper()

        symbols_to_str = {
            ADJ: "ADJ",
            DET: "DET",
@@ -98,14 +99,14 @@ class RussianLemmatizer(Lemmatizer):
            return symbols_to_str[univ_pos]
        return None

    def lookup(self, string, orth=None):
    def lookup(self, string: str, orth: Optional[int] = None) -> str:
        analyses = self._morph.parse(string)
        if len(analyses) == 1:
            return analyses[0].normal_form
        return string


def oc2ud(oc_tag):
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
    gram_map = {
        "_POS": {
            "ADJF": "ADJ",
@@ -160,11 +161,9 @@ def oc2ud(oc_tag):
        "Voice": {"actv": "Act", "pssv": "Pass"},
        "Abbr": {"Abbr": "Yes"},
    }

    pos = "X"
    morphology = dict()
    unmatched = set()

    grams = oc_tag.replace(" ", ",").split(",")
    for gram in grams:
        match = False
@@ -177,7 +176,6 @@ def oc2ud(oc_tag):
                    morphology[categ] = gmap[gram]
        if not match:
            unmatched.add(gram)

    while len(unmatched) > 0:
        gram = unmatched.pop()
        if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
@@ -186,8 +184,4 @@ def oc2ud(oc_tag):
            pos = "AUX"
        elif gram == "Pltm":
            morphology["Number"] = "Ptan"

    return pos, morphology


PUNCT_RULES = {"«": '"', "»": '"'}
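
For reference, oc2ud converts a pymorphy2/OpenCorpora tag string into a Universal Dependencies POS plus a morphological feature dict. A hedged sketch of the expected call pattern; the tag string below is illustrative only, and the exact features returned depend on the parts of gram_map not shown in this hunk:

    from spacy.lang.ru.lemmatizer import oc2ud

    pos, morph = oc2ud("ADJF,Qual masc,sing,nomn")
    print(pos)    # "ADJ", via gram_map["_POS"]["ADJF"]
    print(morph)  # e.g. Case/Gender/Number features, depending on the full gram_map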
@@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ...language import Language
from ...attrs import LANG
from ...util import registry


class SinhalaDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "si"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "si"
stop_words = {"@language_data": "spacy.si.stop_words"}
lex_attr_getters = {"@language_data": "spacy.si.lex_attr_getters"}
"""


@registry.language_data("spacy.si.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.si.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class Sinhala(Language):
    lang = "si"
    Defaults = SinhalaDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Sinhala"]
@@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ...language import Language
from ...attrs import LANG
from ...util import registry


class SlovakDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "sk"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "sk"
stop_words = {"@language_data": "spacy.sk.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sk.lex_attr_getters"}
"""


@registry.language_data("spacy.sk.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.sk.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class Slovak(Language):
    lang = "sk"
    Defaults = SlovakDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Slovak"]
@@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry


class SlovenianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "sl"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "sl"
stop_words = {"@language_data": "spacy.sl.stop_words"}
"""


@registry.language_data("spacy.sl.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Slovenian(Language):
    lang = "sl"
    Defaults = SlovenianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Slovenian"]
@@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry


class AlbanianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "sq"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "sq"
stop_words = {"@language_data": "spacy.sq.stop_words"}
"""


@registry.language_data("spacy.sq.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Albanian(Language):
    lang = "sq"
    Defaults = AlbanianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Albanian"]
@@ -1,23 +1,47 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "sr"
stop_words = {"@language_data": "spacy.sr.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.sr.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.sr.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class SerbianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "sr"
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS


class Serbian(Language):
    lang = "sr"
    Defaults = SerbianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Serbian"]
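The same pattern repeats across the language modules in this diff: static data is wrapped in a function registered under registry.language_data, and the DEFAULT_CONFIG string refers to it by name through an @language_data block. A minimal, hedged sketch of that mechanism using only the APIs shown above (the "spacy.example.stop_words" name and the two-word stop list are made up for illustration):

from typing import Set

from thinc.api import Config
from spacy.util import registry


@registry.language_data("spacy.example.stop_words")
def example_stop_words() -> Set[str]:
    # Stand-in data; the real modules return their language's STOP_WORDS set
    return {"a", "the"}


CONFIG = """
[nlp]
lang = "sr"
stop_words = {"@language_data": "spacy.example.stop_words"}
"""

config = Config().from_str(CONFIG)
# The {"@language_data": ...} value is a reference that is resolved to the
# registered function's return value when the nlp object is built from the
# config, so the data itself never has to be serialized into the config.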
@@ -1,35 +1,54 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from .syntax_iterators import SYNTAX_ITERATORS

# Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from .syntax_iterators import SYNTAX_ITERATORS

DEFAULT_CONFIG = """
[nlp]
lang = "sv"
stop_words = {"@language_data": "spacy.sv.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.sv.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.sv.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class SwedishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "sv"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    stop_words = STOP_WORDS
    syntax_iterators = SYNTAX_ITERATORS


class Swedish(Language):
    lang = "sv"
    Defaults = SwedishDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Swedish"]
@@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ...language import Language
from ...attrs import LANG
from ...util import registry


class TamilDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "ta"
    lex_attr_getters.update(LEX_ATTRS)
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "ta"
stop_words = {"@language_data": "spacy.ta.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
"""


@registry.language_data("spacy.ta.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.ta.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class Tamil(Language):
    lang = "ta"
    Defaults = TamilDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Tamil"]
@@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ...language import Language
from ...attrs import LANG
from ...util import registry


class TeluguDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "te"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "te"
stop_words = {"@language_data": "spacy.te.stop_words"}
lex_attr_getters = {"@language_data": "spacy.te.lex_attr_getters"}
"""


@registry.language_data("spacy.te.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.te.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class Telugu(Language):
    lang = "te"
    Defaults = TeluguDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Telugu"]
@@ -1,15 +1,44 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
from ...util import DummyTokenizer
from ...util import DummyTokenizer, registry


DEFAULT_CONFIG = """
[nlp]
lang = "th"
stop_words = {"@language_data": "spacy.th.stop_words"}
lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}

[nlp.tokenizer]
@tokenizers = "spacy.ThaiTokenizer.v1"
"""


@registry.language_data("spacy.th.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.th.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


@registry.tokenizers("spacy.ThaiTokenizer.v1")
def create_thai_tokenizer():
    def thai_tokenizer_factory(nlp):
        return ThaiTokenizer(nlp)

    return thai_tokenizer_factory


class ThaiTokenizer(DummyTokenizer):
    def __init__(self, cls, nlp=None):
    def __init__(self, nlp: Language) -> None:
        try:
            from pythainlp.tokenize import word_tokenize
        except ImportError:
@@ -17,34 +46,18 @@ class ThaiTokenizer(DummyTokenizer):
                "The Thai tokenizer requires the PyThaiNLP library: "
                "https://github.com/PyThaiNLP/pythainlp"
            )

        self.word_tokenize = word_tokenize
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        self.vocab = nlp.vocab

    def __call__(self, text):
    def __call__(self, text: str) -> Doc:
        words = list(self.word_tokenize(text))
        spaces = [False] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)


class ThaiDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda _text: "th"
    tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS

    @classmethod
    def create_tokenizer(cls, nlp=None):
        return ThaiTokenizer(cls, nlp)


class Thai(Language):
    lang = "th"
    Defaults = ThaiDefaults

    def make_doc(self, text):
        return self.tokenizer(text)
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Thai"]
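Custom tokenizers follow the same registry idea: the [nlp.tokenizer] block names an entry in registry.tokenizers, and the registered function returns a factory that receives the nlp object and builds the tokenizer. A hedged sketch with a trivial whitespace tokenizer ("spacy.MyWhitespaceTokenizer.v1" is a made-up name; DummyTokenizer and registry come from spacy.util as in the modules above):

from spacy.language import Language
from spacy.tokens import Doc
from spacy.util import DummyTokenizer, registry


@registry.tokenizers("spacy.MyWhitespaceTokenizer.v1")
def create_whitespace_tokenizer():
    def whitespace_tokenizer_factory(nlp):
        # Called with the nlp object so the tokenizer can share its vocab
        return WhitespaceTokenizer(nlp)

    return whitespace_tokenizer_factory


class WhitespaceTokenizer(DummyTokenizer):
    def __init__(self, nlp: Language) -> None:
        self.vocab = nlp.vocab

    def __call__(self, text: str) -> Doc:
        words = text.split()
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

A config would then select it with @tokenizers = "spacy.MyWhitespaceTokenizer.v1" under [nlp.tokenizer], just like the Thai entry above.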
@@ -1,31 +1,47 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


def _return_tl(_):
    return "tl"
DEFAULT_CONFIG = """
[nlp]
lang = "tl"
stop_words = {"@language_data": "spacy.tl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.tl.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.tl.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class TagalogDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = _return_tl
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    lex_attr_getters.update(LEX_ATTRS)
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS


class Tagalog(Language):
    lang = "tl"
    Defaults = TagalogDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Tagalog"]
@@ -1,26 +1,40 @@
from typing import Set
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "tr"
stop_words = {"@language_data": "spacy.tr.stop_words"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.tr.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class TurkishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "tr"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS


class Turkish(Language):
    lang = "tr"
    Defaults = TurkishDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Turkish"]
@@ -1,28 +1,42 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...attrs import LANG
from ...language import Language
from ...util import update_exc
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "tt"
stop_words = {"@language_data": "spacy.tt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.tt.lex_attr_getters"}
"""


@registry.language_data("spacy.tt.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.tt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class TatarDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "tt"

    lex_attr_getters.update(LEX_ATTRS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    infixes = tuple(TOKENIZER_INFIXES)

    stop_words = STOP_WORDS


class Tatar(Language):
    lang = "tt"
    Defaults = TatarDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Tatar"]
@@ -1,36 +1,49 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...util import update_exc, add_lookups
from ...util import update_exc, registry
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from .lemmatizer import UkrainianLemmatizer


class UkrainianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "uk"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    lex_attr_getters.update(LEX_ATTRS)
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "uk"
stop_words = {"@language_data": "spacy.uk.stop_words"}
lex_attr_getters = {"@language_data": "spacy.uk.lex_attr_getters"}

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        if lookups is None:
            lookups = Lookups()
        return UkrainianLemmatizer(lookups)
[nlp.lemmatizer]
@lemmatizers = "spacy.UkrainianLemmatizer.v1"
"""


@registry.language_data("spacy.uk.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.uk.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


@registry.lemmatizers("spacy.UkrainianLemmatizer.v1")
def create_ukrainian_lemmatizer() -> UkrainianLemmatizer:
    return UkrainianLemmatizer()


class UkrainianDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)


class Ukrainian(Language):
    lang = "uk"
    Defaults = UkrainianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Ukrainian"]
@@ -1,11 +1,17 @@
from typing import Optional, List, Tuple, Dict

from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
from ...lookups import Lookups
from ...lemmatizer import Lemmatizer


PUNCT_RULES = {"«": '"', "»": '"'}


class UkrainianLemmatizer(Lemmatizer):
    _morph = None

    def __init__(self, lookups=None):
    def __init__(self, lookups: Optional[Lookups] = None) -> None:
        super(UkrainianLemmatizer, self).__init__(lookups)
        try:
            from pymorphy2 import MorphAnalyzer
@@ -19,15 +25,15 @@ class UkrainianLemmatizer(Lemmatizer):
                '"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
            )

    def __call__(self, string, univ_pos, morphology=None):
    def __call__(
        self, string: str, univ_pos: str, morphology: Optional[dict] = None
    ) -> List[str]:
        univ_pos = self.normalize_univ_pos(univ_pos)
        if univ_pos == "PUNCT":
            return [PUNCT_RULES.get(string, string)]

        if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
            # Skip unchangeable pos
            return [string.lower()]

        analyses = self._morph.parse(string)
        filtered_analyses = []
        for analysis in analyses:
@@ -39,12 +45,10 @@ class UkrainianLemmatizer(Lemmatizer):
                analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
            ):
                filtered_analyses.append(analysis)

        if not len(filtered_analyses):
            return [string.lower()]
        if morphology is None or (len(morphology) == 1 and POS in morphology):
            return list(set([analysis.normal_form for analysis in filtered_analyses]))

        if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
            features_to_compare = ["Case", "Number", "Gender"]
        elif univ_pos == "NUM":
@@ -61,7 +65,6 @@ class UkrainianLemmatizer(Lemmatizer):
                "VerbForm",
                "Voice",
            ]

        analyses, filtered_analyses = filtered_analyses, []
        for analysis in analyses:
            _, analysis_morph = oc2ud(str(analysis.tag))
@@ -74,16 +77,14 @@ class UkrainianLemmatizer(Lemmatizer):
                    break
            else:
                filtered_analyses.append(analysis)

        if not len(filtered_analyses):
            return [string.lower()]
        return list(set([analysis.normal_form for analysis in filtered_analyses]))

    @staticmethod
    def normalize_univ_pos(univ_pos):
    def normalize_univ_pos(univ_pos: str) -> Optional[str]:
        if isinstance(univ_pos, str):
            return univ_pos.upper()

        symbols_to_str = {
            ADJ: "ADJ",
            DET: "DET",
@@ -98,14 +99,14 @@ class UkrainianLemmatizer(Lemmatizer):
            return symbols_to_str[univ_pos]
        return None

    def lookup(self, string, orth=None):
    def lookup(self, string: str, orth: Optional[int] = None) -> str:
        analyses = self._morph.parse(string)
        if len(analyses) == 1:
            return analyses[0].normal_form
        return string


def oc2ud(oc_tag):
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
    gram_map = {
        "_POS": {
            "ADJF": "ADJ",
@@ -160,11 +161,9 @@ def oc2ud(oc_tag):
        "Voice": {"actv": "Act", "pssv": "Pass"},
        "Abbr": {"Abbr": "Yes"},
    }

    pos = "X"
    morphology = dict()
    unmatched = set()

    grams = oc_tag.replace(" ", ",").split(",")
    for gram in grams:
        match = False
@@ -177,7 +176,6 @@ def oc2ud(oc_tag):
                    morphology[categ] = gmap[gram]
        if not match:
            unmatched.add(gram)

    while len(unmatched) > 0:
        gram = unmatched.pop()
        if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
@@ -186,8 +184,4 @@ def oc2ud(oc_tag):
            pos = "AUX"
        elif gram == "Pltm":
            morphology["Number"] = "Ptan"

    return pos, morphology


PUNCT_RULES = {"«": '"', "»": '"'}
@@ -1,26 +1,53 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "ur"
stop_words = {"@language_data": "spacy.ur.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ur.lex_attr_getters"}

[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.ur.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.ur.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class UrduDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "ur"

    tokenizer_exceptions = BASE_EXCEPTIONS
    stop_words = STOP_WORDS
    suffixes = TOKENIZER_SUFFIXES
    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}


class Urdu(Language):
    lang = "ur"
    Defaults = UrduDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Urdu"]
@@ -1,38 +1,62 @@
from ...attrs import LANG, NORM
from ..norm_exceptions import BASE_NORMS
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from ...language import Language
from ...tokens import Doc
from .stop_words import STOP_WORDS
from ...util import add_lookups
from ...util import DummyTokenizer, registry
from .lex_attrs import LEX_ATTRS


class VietnameseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "vi"  # for pickling
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    lex_attr_getters.update(LEX_ATTRS)
    stop_words = STOP_WORDS
    use_pyvi = True
DEFAULT_CONFIG = """
[nlp]
lang = "vi"
stop_words = {"@language_data": "spacy.vi.stop_words"}
lex_attr_getters = {"@language_data": "spacy.vi.lex_attr_getters"}

[nlp.tokenizer]
@tokenizers = "spacy.VietnameseTokenizer.v1"
use_pyvi = true
"""


class Vietnamese(Language):
    lang = "vi"
    Defaults = VietnameseDefaults  # override defaults
@registry.language_data("spacy.vi.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

    def make_doc(self, text):
        if self.Defaults.use_pyvi:

@registry.language_data("spacy.vi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


@registry.tokenizers("spacy.VietnameseTokenizer.v1")
def create_vietnamese_tokenizer(use_pyvi: bool = True,):
    def vietnamese_tokenizer_factory(nlp):
        return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)

    return vietnamese_tokenizer_factory


class VietnameseTokenizer(DummyTokenizer):
    def __init__(self, nlp: Language, use_pyvi: bool = False):
        self.vocab = nlp.vocab
        self.use_pyvi = use_pyvi
        if self.use_pyvi:
            try:
                from pyvi import ViTokenizer

                self.ViTokenizer = ViTokenizer
            except ImportError:
                msg = (
                    "Pyvi not installed. Either set Vietnamese.use_pyvi = False, "
                    "Pyvi not installed. Either set use_pyvi = False, "
                    "or install it https://pypi.python.org/pypi/pyvi"
                )
                raise ImportError(msg)
            words, spaces = ViTokenizer.spacy_tokenize(text)

    def __call__(self, text: str) -> Doc:
        if self.use_pyvi:
            words, spaces = self.ViTokenizer.spacy_tokenize(text)
            return Doc(self.vocab, words=words, spaces=spaces)
        else:
            words = []
@@ -44,4 +68,9 @@ class Vietnamese(Language):
            return Doc(self.vocab, words=words, spaces=spaces)


class Vietnamese(Language):
    lang = "vi"
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Vietnamese"]
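Settings on a registered tokenizer, such as use_pyvi above, are just extra keys in the [nlp.tokenizer] block and are passed to the factory as arguments. A small, hedged sketch of overriding the default purely in the config; only keys already present in the Vietnamese DEFAULT_CONFIG are used, and how the resolved factory gets attached to an nlp object is still in flux on this branch:

from thinc.api import Config

config_str = """
[nlp]
lang = "vi"

[nlp.tokenizer]
@tokenizers = "spacy.VietnameseTokenizer.v1"
use_pyvi = false
"""

config = Config().from_str(config_str)
# The tokenizer block now requests the registered Vietnamese factory with
# use_pyvi=False, so the non-pyvi branch of VietnameseTokenizer.__call__ runs.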
@@ -1,17 +1,17 @@
from thinc.api import Config

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


DEFAULT_CONFIG = """
[nlp]
lang = "xx"
"""


class MultiLanguageDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "xx"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
    tokenizer_exceptions = BASE_EXCEPTIONS


class MultiLanguage(Language):
@@ -21,6 +21,7 @@ class MultiLanguage(Language):

    lang = "xx"
    Defaults = MultiLanguageDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["MultiLanguage"]
@@ -1,21 +1,39 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "si"
stop_words = {"@language_data": "spacy.yo.stop_words"}
lex_attr_getters = {"@language_data": "spacy.yo.lex_attr_getters"}
"""


@registry.language_data("spacy.yo.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.yo.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class YorubaDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "yo"
    stop_words = STOP_WORDS
    tokenizer_exceptions = BASE_EXCEPTIONS


class Yoruba(Language):
    lang = "yo"
    Defaults = YorubaDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Yoruba"]
			@ -1,13 +1,15 @@
 | 
			
		|||
from typing import Optional, List, Set, Dict, Callable, Any
 | 
			
		||||
from enum import Enum
 | 
			
		||||
import tempfile
 | 
			
		||||
import srsly
 | 
			
		||||
import warnings
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from collections import OrderedDict
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from ...errors import Warnings, Errors
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...tokens import Doc
 | 
			
		||||
from ...util import DummyTokenizer
 | 
			
		||||
from ...util import DummyTokenizer, registry
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
| 
						 | 
				
			
			@ -16,88 +18,103 @@ from ... import util
 | 
			
		|||
 | 
			
		||||
_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "zh"
 | 
			
		||||
stop_words = {"@language_data": "spacy.zh.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.zh.lex_attr_getters"}
 | 
			
		||||
 | 
			
		||||
def try_jieba_import(segmenter):
 | 
			
		||||
    try:
 | 
			
		||||
        import jieba
 | 
			
		||||
[nlp.tokenizer]
 | 
			
		||||
@tokenizers = "spacy.ChineseTokenizer.v1"
 | 
			
		||||
segmenter = "char"
 | 
			
		||||
pkuseg_model = null
 | 
			
		||||
pkuseg_user_dict = "default"
 | 
			
		||||
 | 
			
		||||
        if segmenter == "jieba":
 | 
			
		||||
            # segment a short text to have jieba initialize its cache in advance
 | 
			
		||||
            list(jieba.cut("作为", cut_all=False))
 | 
			
		||||
 | 
			
		||||
        return jieba
 | 
			
		||||
    except ImportError:
 | 
			
		||||
        if segmenter == "jieba":
 | 
			
		||||
            msg = (
 | 
			
		||||
                "Jieba not installed. To use jieba, install it with `pip "
 | 
			
		||||
                " install jieba` or from https://github.com/fxsjy/jieba"
 | 
			
		||||
            )
 | 
			
		||||
            raise ImportError(msg)
 | 
			
		||||
[nlp.writing_system]
 | 
			
		||||
direction = "ltr"
 | 
			
		||||
has_case = false
 | 
			
		||||
has_letters = false
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def try_pkuseg_import(segmenter, pkuseg_model, pkuseg_user_dict):
 | 
			
		||||
    try:
 | 
			
		||||
        import pkuseg
 | 
			
		||||
class Segmenter(str, Enum):
 | 
			
		||||
    char = "char"
 | 
			
		||||
    jieba = "jieba"
 | 
			
		||||
    pkuseg = "pkuseg"
 | 
			
		||||
 | 
			
		||||
        if pkuseg_model:
 | 
			
		||||
            return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
 | 
			
		||||
        elif segmenter == "pkuseg":
 | 
			
		||||
            msg = (
 | 
			
		||||
                "The Chinese word segmenter is 'pkuseg' but no pkuseg model "
 | 
			
		||||
                "was specified. Please provide the name of a pretrained model "
 | 
			
		||||
                "or the path to a model with "
 | 
			
		||||
                '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
 | 
			
		||||
                'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`'
 | 
			
		||||
            )
 | 
			
		||||
            raise ValueError(msg)
 | 
			
		||||
    except ImportError:
 | 
			
		||||
        if segmenter == "pkuseg":
 | 
			
		||||
            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
 | 
			
		||||
            raise ImportError(msg)
 | 
			
		||||
    except FileNotFoundError:
 | 
			
		||||
        if segmenter == "pkuseg":
 | 
			
		||||
            msg = "Unable to load pkuseg model from: " + pkuseg_model
 | 
			
		||||
            raise FileNotFoundError(msg)
 | 
			
		||||
    @classmethod
 | 
			
		||||
    def values(cls):
 | 
			
		||||
        return list(cls.__members__.keys())
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.zh.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.zh.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.tokenizers("spacy.ChineseTokenizer.v1")
 | 
			
		||||
def create_chinese_tokenizer(
 | 
			
		||||
    segmenter: Segmenter = Segmenter.char,
 | 
			
		||||
    pkuseg_model: Optional[str] = None,
 | 
			
		||||
    pkuseg_user_dict: Optional[str] = "default",
 | 
			
		||||
):
 | 
			
		||||
    def chinese_tokenizer_factory(nlp):
 | 
			
		||||
        return ChineseTokenizer(
 | 
			
		||||
            nlp,
 | 
			
		||||
            segmenter=segmenter,
 | 
			
		||||
            pkuseg_model=pkuseg_model,
 | 
			
		||||
            pkuseg_user_dict=pkuseg_user_dict,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    return chinese_tokenizer_factory
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ChineseTokenizer(DummyTokenizer):
 | 
			
		||||
    def __init__(self, cls, nlp=None, config={}):
 | 
			
		||||
        self.supported_segmenters = ("char", "jieba", "pkuseg")
 | 
			
		||||
        self.configure_segmenter(config)
 | 
			
		||||
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
 | 
			
		||||
        # remove relevant settings from config so they're not also saved in
 | 
			
		||||
        # Language.meta
 | 
			
		||||
        for key in ["segmenter", "pkuseg_model", "pkuseg_user_dict"]:
 | 
			
		||||
            if key in config:
 | 
			
		||||
                del config[key]
 | 
			
		||||
        self.tokenizer = Language.Defaults().create_tokenizer(nlp)
 | 
			
		||||
    def __init__(
 | 
			
		||||
        self,
 | 
			
		||||
        nlp: Language,
 | 
			
		||||
        segmenter: Segmenter = Segmenter.char,
 | 
			
		||||
        pkuseg_model: Optional[str] = None,
 | 
			
		||||
        pkuseg_user_dict: Optional[str] = None,
 | 
			
		||||
    ):
 | 
			
		||||
        self.vocab = nlp.vocab
 | 
			
		||||
        if isinstance(segmenter, Segmenter):  # we might have the Enum here
 | 
			
		||||
            segmenter = segmenter.value
 | 
			
		||||
        self.segmenter = segmenter
 | 
			
		||||
        self.pkuseg_model = pkuseg_model
 | 
			
		||||
        self.pkuseg_user_dict = pkuseg_user_dict
 | 
			
		||||
        self.pkuseg_seg = None
 | 
			
		||||
        self.jieba_seg = None
 | 
			
		||||
        self.configure_segmenter(segmenter)
 | 
			
		||||
 | 
			
		||||
    def configure_segmenter(self, config):
 | 
			
		||||
        self.segmenter = "char"
 | 
			
		||||
        if "segmenter" in config:
 | 
			
		||||
            if config["segmenter"] in self.supported_segmenters:
 | 
			
		||||
                self.segmenter = config["segmenter"]
 | 
			
		||||
            else:
 | 
			
		||||
                warn_msg = Warnings.W103.format(
 | 
			
		||||
                    lang="Chinese",
 | 
			
		||||
                    segmenter=config["segmenter"],
 | 
			
		||||
                    supported=", ".join([repr(s) for s in self.supported_segmenters]),
 | 
			
		||||
                    default="'char' (character segmentation)",
 | 
			
		||||
                )
 | 
			
		||||
                warnings.warn(warn_msg)
 | 
			
		||||
    def configure_segmenter(self, segmenter: str):
 | 
			
		||||
        if segmenter not in Segmenter.values():
 | 
			
		||||
            warn_msg = Warnings.W103.format(
 | 
			
		||||
                lang="Chinese",
 | 
			
		||||
                segmenter=segmenter,
 | 
			
		||||
                supported=", ".join(Segmenter.values()),
 | 
			
		||||
                default="'char' (character segmentation)",
 | 
			
		||||
            )
 | 
			
		||||
            warnings.warn(warn_msg)
 | 
			
		||||
            self.segmenter = Segmenter.char
 | 
			
		||||
        self.jieba_seg = try_jieba_import(self.segmenter)
 | 
			
		||||
        self.pkuseg_seg = try_pkuseg_import(
 | 
			
		||||
            self.segmenter,
 | 
			
		||||
            pkuseg_model=config.get("pkuseg_model", None),
 | 
			
		||||
            pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
 | 
			
		||||
            pkuseg_model=self.pkuseg_model,
 | 
			
		||||
            pkuseg_user_dict=self.pkuseg_user_dict,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def __call__(self, text):
 | 
			
		||||
        if self.segmenter == "jieba":
 | 
			
		||||
    def __call__(self, text: str) -> Doc:
 | 
			
		||||
        if self.segmenter == Segmenter.jieba:
 | 
			
		||||
            words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
 | 
			
		||||
            (words, spaces) = util.get_words_and_spaces(words, text)
 | 
			
		||||
            return Doc(self.vocab, words=words, spaces=spaces)
 | 
			
		||||
        elif self.segmenter == "pkuseg":
 | 
			
		||||
        elif self.segmenter == Segmenter.pkuseg:
 | 
			
		||||
            if self.pkuseg_seg is None:
 | 
			
		||||
                raise ValueError(Errors.E1000)
 | 
			
		||||
            words = self.pkuseg_seg.cut(text)
 | 
			
		||||
| 
						 | 
				
			
			@ -105,11 +122,11 @@ class ChineseTokenizer(DummyTokenizer):
 | 
			
		|||
            return Doc(self.vocab, words=words, spaces=spaces)
 | 
			
		||||
 | 
			
		||||
        # warn if segmenter setting is not the only remaining option "char"
 | 
			
		||||
        if self.segmenter != "char":
 | 
			
		||||
        if self.segmenter != Segmenter.char:
 | 
			
		||||
            warn_msg = Warnings.W103.format(
 | 
			
		||||
                lang="Chinese",
 | 
			
		||||
                segmenter=self.segmenter,
 | 
			
		||||
                supported=", ".join([repr(s) for s in self.supported_segmenters]),
 | 
			
		||||
                supported=", ".join(Segmenter.values()),
 | 
			
		||||
                default="'char' (character segmentation)",
 | 
			
		||||
            )
 | 
			
		||||
            warnings.warn(warn_msg)
 | 
			
		||||
| 
						 | 
				
			
			@ -119,33 +136,25 @@ class ChineseTokenizer(DummyTokenizer):
 | 
			
		|||
        (words, spaces) = util.get_words_and_spaces(words, text)
 | 
			
		||||
        return Doc(self.vocab, words=words, spaces=spaces)
 | 
			
		||||
 | 
			
		||||
    def pkuseg_update_user_dict(self, words, reset=False):
 | 
			
		||||
        if self.segmenter == "pkuseg":
 | 
			
		||||
    def pkuseg_update_user_dict(self, words: List[str], reset: bool = False):
 | 
			
		||||
        if self.segmenter == Segmenter.pkuseg:
 | 
			
		||||
            if reset:
 | 
			
		||||
                try:
 | 
			
		||||
                    import pkuseg
 | 
			
		||||
 | 
			
		||||
                    self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
 | 
			
		||||
                except ImportError:
 | 
			
		||||
                    if self.segmenter == "pkuseg":
 | 
			
		||||
                        msg = (
 | 
			
		||||
                            "pkuseg not installed: unable to reset pkuseg "
 | 
			
		||||
                            "user dict. Please " + _PKUSEG_INSTALL_MSG
 | 
			
		||||
                        )
 | 
			
		||||
                        raise ImportError(msg)
 | 
			
		||||
                    msg = (
 | 
			
		||||
                        "pkuseg not installed: unable to reset pkuseg "
 | 
			
		||||
                        "user dict. Please " + _PKUSEG_INSTALL_MSG
 | 
			
		||||
                    )
 | 
			
		||||
                    raise ImportError(msg)
 | 
			
		||||
            for word in words:
 | 
			
		||||
                self.pkuseg_seg.preprocesser.insert(word.strip(), "")
 | 
			
		||||
        else:
 | 
			
		||||
            warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
 | 
			
		||||
            warnings.warn(warn_msg)
 | 
			
		||||
 | 
			
		||||
    def _get_config(self):
 | 
			
		||||
        config = OrderedDict((("segmenter", self.segmenter),))
 | 
			
		||||
        return config
 | 
			
		||||
 | 
			
		||||
    def _set_config(self, config={}):
 | 
			
		||||
        self.configure_segmenter(config)
 | 
			
		||||
 | 
			
		||||
    def to_bytes(self, **kwargs):
 | 
			
		||||
        pkuseg_features_b = b""
 | 
			
		||||
        pkuseg_weights_b = b""
 | 
			
		||||
| 
						 | 
				
			
			@ -165,17 +174,11 @@ class ChineseTokenizer(DummyTokenizer):
 | 
			
		|||
                sorted(list(self.pkuseg_seg.postprocesser.common_words)),
 | 
			
		||||
                sorted(list(self.pkuseg_seg.postprocesser.other_words)),
 | 
			
		||||
            )
 | 
			
		||||
        serializers = OrderedDict(
 | 
			
		||||
            (
 | 
			
		||||
                ("cfg", lambda: srsly.json_dumps(self._get_config())),
 | 
			
		||||
                ("pkuseg_features", lambda: pkuseg_features_b),
 | 
			
		||||
                ("pkuseg_weights", lambda: pkuseg_weights_b),
 | 
			
		||||
                (
 | 
			
		||||
                    "pkuseg_processors",
 | 
			
		||||
                    lambda: srsly.msgpack_dumps(pkuseg_processors_data),
 | 
			
		||||
                ),
 | 
			
		||||
            )
 | 
			
		||||
        )
 | 
			
		||||
        serializers = {
 | 
			
		||||
            "pkuseg_features": lambda: pkuseg_features_b,
 | 
			
		||||
            "pkuseg_weights": lambda: pkuseg_weights_b,
 | 
			
		||||
            "pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data),
 | 
			
		||||
        }
 | 
			
		||||
        return util.to_bytes(serializers, [])
 | 
			
		||||
 | 
			
		||||
    def from_bytes(self, data, **kwargs):
 | 
			
		||||
| 
						 | 
				
			
			@ -190,14 +193,11 @@ class ChineseTokenizer(DummyTokenizer):
 | 
			
		|||
        def deserialize_pkuseg_processors(b):
 | 
			
		||||
            pkuseg_data["processors_data"] = srsly.msgpack_loads(b)
 | 
			
		||||
 | 
			
		||||
        deserializers = OrderedDict(
 | 
			
		||||
            (
 | 
			
		||||
                ("cfg", lambda b: self._set_config(srsly.json_loads(b))),
 | 
			
		||||
                ("pkuseg_features", deserialize_pkuseg_features),
 | 
			
		||||
                ("pkuseg_weights", deserialize_pkuseg_weights),
 | 
			
		||||
                ("pkuseg_processors", deserialize_pkuseg_processors),
 | 
			
		||||
            )
 | 
			
		||||
        )
 | 
			
		||||
        deserializers = {
 | 
			
		||||
            "pkuseg_features": deserialize_pkuseg_features,
 | 
			
		||||
            "pkuseg_weights": deserialize_pkuseg_weights,
 | 
			
		||||
            "pkuseg_processors": deserialize_pkuseg_processors,
 | 
			
		||||
        }
 | 
			
		||||
        util.from_bytes(data, deserializers, [])
 | 
			
		||||
 | 
			
		||||
        if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
 | 
			
		||||
| 
						 | 
				
			
			@ -245,13 +245,10 @@ class ChineseTokenizer(DummyTokenizer):
 | 
			
		|||
                )
 | 
			
		||||
                srsly.write_msgpack(path, data)
 | 
			
		||||
 | 
			
		||||
        serializers = OrderedDict(
 | 
			
		||||
            (
 | 
			
		||||
                ("cfg", lambda p: srsly.write_json(p, self._get_config())),
 | 
			
		||||
                ("pkuseg_model", lambda p: save_pkuseg_model(p)),
 | 
			
		||||
                ("pkuseg_processors", lambda p: save_pkuseg_processors(p)),
 | 
			
		||||
            )
 | 
			
		||||
        )
 | 
			
		||||
        serializers = {
 | 
			
		||||
            "pkuseg_model": lambda p: save_pkuseg_model(p),
 | 
			
		||||
            "pkuseg_processors": lambda p: save_pkuseg_processors(p),
 | 
			
		||||
        }
 | 
			
		||||
        return util.to_disk(path, serializers, [])
 | 
			
		||||
 | 
			
		||||
    def from_disk(self, path, **kwargs):
 | 
			
		||||
| 
						 | 
				
			
			@ -261,7 +258,7 @@ class ChineseTokenizer(DummyTokenizer):
 | 
			
		|||
            try:
 | 
			
		||||
                import pkuseg
 | 
			
		||||
            except ImportError:
 | 
			
		||||
                if self.segmenter == "pkuseg":
 | 
			
		||||
                if self.segmenter == Segmenter.pkuseg:
 | 
			
		||||
                    raise ImportError(
 | 
			
		||||
                        "pkuseg not installed. To use this model, "
 | 
			
		||||
                        + _PKUSEG_INSTALL_MSG
 | 
			
		||||
| 
						 | 
				
			
			@ -273,9 +270,9 @@ class ChineseTokenizer(DummyTokenizer):
 | 
			
		|||
            try:
 | 
			
		||||
                import pkuseg
 | 
			
		||||
            except ImportError:
 | 
			
		||||
                if self.segmenter == "pkuseg":
 | 
			
		||||
                if self.segmenter == Segmenter.pkuseg:
 | 
			
		||||
                    raise ImportError(self._pkuseg_install_msg)
 | 
			
		||||
            if self.segmenter == "pkuseg":
 | 
			
		||||
            if self.segmenter == Segmenter.pkuseg:
 | 
			
		||||
                data = srsly.read_msgpack(path)
 | 
			
		||||
                (user_dict, do_process, common_words, other_words) = data
 | 
			
		||||
                self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
 | 
			
		||||
| 
						 | 
				
			
			@ -283,35 +280,64 @@ class ChineseTokenizer(DummyTokenizer):
 | 
			
		|||
                self.pkuseg_seg.postprocesser.common_words = set(common_words)
 | 
			
		||||
                self.pkuseg_seg.postprocesser.other_words = set(other_words)
 | 
			
		||||
 | 
			
		||||
        serializers = OrderedDict(
 | 
			
		||||
            (
 | 
			
		||||
                ("cfg", lambda p: self._set_config(srsly.read_json(p))),
 | 
			
		||||
                ("pkuseg_model", lambda p: load_pkuseg_model(p)),
 | 
			
		||||
                ("pkuseg_processors", lambda p: load_pkuseg_processors(p)),
 | 
			
		||||
            )
 | 
			
		||||
        )
 | 
			
		||||
        serializers = {
 | 
			
		||||
            "pkuseg_model": lambda p: load_pkuseg_model(p),
 | 
			
		||||
            "pkuseg_processors": lambda p: load_pkuseg_processors(p),
 | 
			
		||||
        }
 | 
			
		||||
        util.from_disk(path, serializers, [])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ChineseDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "zh"
 | 
			
		||||
    tokenizer_exceptions = BASE_EXCEPTIONS
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
 | 
			
		||||
 | 
			
		||||
    @classmethod
 | 
			
		||||
    def create_tokenizer(cls, nlp=None, config={}):
 | 
			
		||||
        return ChineseTokenizer(cls, nlp, config=config)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Chinese(Language):
 | 
			
		||||
    lang = "zh"
 | 
			
		||||
    Defaults = ChineseDefaults  # override defaults
 | 
			
		||||
    Defaults = ChineseDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
    def make_doc(self, text):
 | 
			
		||||
        return self.tokenizer(text)
 | 
			
		||||
 | 
			
		||||
def try_jieba_import(segmenter: str) -> None:
 | 
			
		||||
    try:
 | 
			
		||||
        import jieba
 | 
			
		||||
 | 
			
		||||
        if segmenter == Segmenter.jieba:
 | 
			
		||||
            # segment a short text to have jieba initialize its cache in advance
 | 
			
		||||
            list(jieba.cut("作为", cut_all=False))
 | 
			
		||||
 | 
			
		||||
        return jieba
 | 
			
		||||
    except ImportError:
 | 
			
		||||
        if segmenter == Segmenter.jieba:
 | 
			
		||||
            msg = (
 | 
			
		||||
                "Jieba not installed. To use jieba, install it with `pip "
 | 
			
		||||
                " install jieba` or from https://github.com/fxsjy/jieba"
 | 
			
		||||
            )
 | 
			
		||||
            raise ImportError(msg)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) -> None:
 | 
			
		||||
    try:
 | 
			
		||||
        import pkuseg
 | 
			
		||||
 | 
			
		||||
        if pkuseg_model:
 | 
			
		||||
            return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
 | 
			
		||||
        elif segmenter == Segmenter.pkuseg:
 | 
			
		||||
            msg = (
 | 
			
		||||
                "The Chinese word segmenter is 'pkuseg' but no pkuseg model "
 | 
			
		||||
                "was specified. Please provide the name of a pretrained model "
 | 
			
		||||
                "or the path to a model with:\n"
 | 
			
		||||
                'cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path }}\n'
 | 
			
		||||
                "nlp = Chinese.from_config(cfg)"
 | 
			
		||||
            )
 | 
			
		||||
            raise ValueError(msg)
 | 
			
		||||
    except ImportError:
 | 
			
		||||
        if segmenter == Segmenter.pkuseg:
 | 
			
		||||
            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
 | 
			
		||||
            raise ImportError(msg)
 | 
			
		||||
    except FileNotFoundError:
 | 
			
		||||
        if segmenter == Segmenter.pkuseg:
 | 
			
		||||
            msg = "Unable to load pkuseg model from: " + pkuseg_model
 | 
			
		||||
            raise FileNotFoundError(msg)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _get_pkuseg_trie_data(node, path=""):
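The error message in try_pkuseg_import above doubles as a usage hint. A minimal sketch of that configuration follows; the model path is hypothetical, pkuseg must be installed, and the exact from_config behavior was still work in progress in this refactor:

# Hedged sketch based on the error message above; "/path/to/pkuseg_model" is a placeholder.
from spacy.lang.zh import Chinese

cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": "/path/to/pkuseg_model"}}}
nlp = Chinese.from_config(cfg)
doc = nlp("这是一个句子")  # tokenized with the configured pkuseg segmenter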
spacy/language.py: file diff suppressed because it is too large (1052 changed lines not shown).
@@ -1,5 +1,14 @@
+from typing import Optional, Callable, List, Dict

from .lookups import Lookups
from .errors import Errors
from .parts_of_speech import NAMES as UPOS_NAMES
+from .util import registry, load_language_data, SimpleFrozenDict


+@registry.lemmatizers("spacy.Lemmatizer.v1")
+def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
+    return Lemmatizer(data_paths=data_paths)


class Lemmatizer:

@@ -14,17 +23,27 @@ class Lemmatizer:
    def load(cls, *args, **kwargs):
        raise NotImplementedError(Errors.E172)

-    def __init__(self, lookups, is_base_form=None):
+    def __init__(
+        self,
+        lookups: Optional[Lookups] = None,
+        data_paths: dict = SimpleFrozenDict(),
+        is_base_form: Optional[Callable] = None,
+    ) -> None:
        """Initialize a Lemmatizer.

        lookups (Lookups): The lookups object containing the (optional) tables
            "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
        RETURNS (Lemmatizer): The newly constructed object.
        """
-        self.lookups = lookups
+        self.lookups = lookups if lookups is not None else Lookups()
+        for name, filename in data_paths.items():
+            data = load_language_data(filename)
+            self.lookups.add_table(name, data)
        self.is_base_form = is_base_form

-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
        """Lemmatize a string.

        string (str): The string to lemmatize, e.g. the token text.

@@ -39,7 +58,6 @@ class Lemmatizer:
        if isinstance(univ_pos, int):
            univ_pos = UPOS_NAMES.get(univ_pos, "X")
        univ_pos = univ_pos.lower()

        if univ_pos in ("", "eol", "space"):
            return [string.lower()]
        # See Issue #435 for example of where this logic is requied.

@@ -67,65 +85,31 @@ class Lemmatizer:
        )
        return lemmas

-    def is_base_form(self, univ_pos, morphology=None):
-        """
-        Check whether we're dealing with an uninflected paradigm, so we can
-        avoid lemmatization entirely.
-
-        univ_pos (str / int): The token's universal part-of-speech tag.
-        morphology (dict): The token's morphological features following the
-            Universal Dependencies scheme.
-        """
-        if morphology is None:
-            morphology = {}
-        if univ_pos == "noun" and morphology.get("Number") == "sing":
-            return True
-        elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
-            return True
-        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
-        # morphology
-        elif univ_pos == "verb" and (
-            morphology.get("VerbForm") == "fin"
-            and morphology.get("Tense") == "pres"
-            and morphology.get("Number") is None
-        ):
-            return True
-        elif univ_pos == "adj" and morphology.get("Degree") == "pos":
-            return True
-        elif morphology.get("VerbForm") == "inf":
-            return True
-        elif morphology.get("VerbForm") == "none":
-            return True
-        elif morphology.get("Degree") == "pos":
-            return True
-        else:
-            return False
-
-    def noun(self, string, morphology=None):
+    def noun(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "noun", morphology)

-    def verb(self, string, morphology=None):
+    def verb(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "verb", morphology)

-    def adj(self, string, morphology=None):
+    def adj(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "adj", morphology)

-    def det(self, string, morphology=None):
+    def det(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "det", morphology)

-    def pron(self, string, morphology=None):
+    def pron(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "pron", morphology)

-    def adp(self, string, morphology=None):
+    def adp(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "adp", morphology)

-    def num(self, string, morphology=None):
+    def num(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "num", morphology)

-    def punct(self, string, morphology=None):
+    def punct(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "punct", morphology)

-    def lookup(self, string, orth=None):
+    def lookup(self, string: str, orth: Optional[int] = None) -> str:
        """Look up a lemma in the table, if available. If no lemma is found,
        the original string is returned.

@@ -141,7 +125,13 @@ class Lemmatizer:
            return lookup_table[key]
        return string

-    def lemmatize(self, string, index, exceptions, rules):
+    def lemmatize(
+        self,
+        string: str,
+        index: Dict[str, List[str]],
+        exceptions: Dict[str, Dict[str, List[str]]],
+        rules: Dict[str, List[List[str]]],
+    ) -> List[str]:
        orig = string
        string = string.lower()
        forms = []
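A rough usage sketch of the constructor and lookup API shown above; the import paths are assumed since this diff does not name the files, and the table name follows the docstring:

# Assumed module paths; a minimal sketch, not a full pipeline setup.
from spacy.lookups import Lookups
from spacy.lemmatizer import Lemmatizer

lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog", "mice": "mouse"})
lemmatizer = Lemmatizer(lookups=lookups)
print(lemmatizer.lookup("dogs"))  # "dog"; falls back to the input string if no lemma is found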
@@ -1,15 +1,32 @@
+from typing import Dict, Any, List, Union, Optional
+from pathlib import Path
import srsly
from preshed.bloom import BloomFilter
from collections import OrderedDict

from .errors import Errors
-from .util import SimpleFrozenDict, ensure_path
+from .util import SimpleFrozenDict, ensure_path, registry
from .strings import get_string_id


UNSET = object()


+@registry.language_data("spacy-lookups-data")
+def get_lookups(lang: str) -> Dict[str, Any]:
+    """Load the data from the spacy-lookups-data package for a given language,
+    if available. Returns an empty dict if there's no data or if the package
+    is not installed.
+
+    lang (str): The language code (corresponds to entry point exposed by
+        the spacy-lookups-data package).
+    RETURNS (Dict[str, Any]): The lookups, keyed by table name.
+    """
+    if lang in registry.lookups:
+        return registry.lookups.get(lang)
+    return {}


class Lookups:
    """Container for large lookup tables and dictionaries, e.g. lemmatization
    data or tokenizer exception lists. Lookups are available via vocab.lookups,

@@ -18,7 +35,7 @@ class Lookups:
    via doc.vocab.lookups.
    """

-    def __init__(self):
+    def __init__(self) -> None:
        """Initialize the Lookups object.

        RETURNS (Lookups): The newly created object.

@@ -27,7 +44,7 @@ class Lookups:
        """
        self._tables = {}

-    def __contains__(self, name):
+    def __contains__(self, name: str) -> bool:
        """Check if the lookups contain a table of a given name. Delegates to
        Lookups.has_table.

@@ -36,16 +53,16 @@ class Lookups:
        """
        return self.has_table(name)

-    def __len__(self):
+    def __len__(self) -> int:
        """RETURNS (int): The number of tables in the lookups."""
        return len(self._tables)

    @property
-    def tables(self):
-        """RETURNS (list): Names of all tables in the lookups."""
+    def tables(self) -> List[str]:
+        """RETURNS (List[str]): Names of all tables in the lookups."""
        return list(self._tables.keys())

-    def add_table(self, name, data=SimpleFrozenDict()):
+    def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> "Table":
        """Add a new table to the lookups. Raises an error if the table exists.

        name (str): Unique name of table.

@@ -60,12 +77,12 @@ class Lookups:
        self._tables[name] = table
        return table

-    def get_table(self, name, default=UNSET):
+    def get_table(self, name: str, default: Any = UNSET) -> "Table":
        """Get a table. Raises an error if the table doesn't exist and no
        default value is provided.

        name (str): Name of the table.
-        default: Optional default value to return if table doesn't exist.
+        default (Any): Optional default value to return if table doesn't exist.
        RETURNS (Table): The table.

        DOCS: https://spacy.io/api/lookups#get_table

@@ -76,7 +93,7 @@ class Lookups:
            return default
        return self._tables[name]

-    def remove_table(self, name):
+    def remove_table(self, name: str) -> "Table":
        """Remove a table. Raises an error if the table doesn't exist.

        name (str): Name of the table to remove.

@@ -88,7 +105,7 @@ class Lookups:
            raise KeyError(Errors.E159.format(name=name, tables=self.tables))
        return self._tables.pop(name)

-    def has_table(self, name):
+    def has_table(self, name: str) -> bool:
        """Check if the lookups contain a table of a given name.

        name (str): Name of the table.

@@ -98,7 +115,7 @@ class Lookups:
        """
        return name in self._tables

-    def to_bytes(self, **kwargs):
+    def to_bytes(self, **kwargs) -> bytes:
        """Serialize the lookups to a bytestring.

        RETURNS (bytes): The serialized Lookups.

@@ -107,7 +124,7 @@ class Lookups:
        """
        return srsly.msgpack_dumps(self._tables)

-    def from_bytes(self, bytes_data, **kwargs):
+    def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
        """Load the lookups from a bytestring.

        bytes_data (bytes): The data to load.

@@ -120,7 +137,9 @@ class Lookups:
            self._tables[key] = Table(key, value)
        return self

-    def to_disk(self, path, filename="lookups.bin", **kwargs):
+    def to_disk(
+        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
+    ) -> None:
        """Save the lookups to a directory as lookups.bin. Expects a path to a
        directory, which will be created if it doesn't exist.

@@ -136,7 +155,9 @@ class Lookups:
            with filepath.open("wb") as file_:
                file_.write(self.to_bytes())

-    def from_disk(self, path, filename="lookups.bin", **kwargs):
+    def from_disk(
+        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
+    ) -> "Lookups":
        """Load lookups from a directory containing a lookups.bin. Will skip
        loading if the file doesn't exist.

@@ -162,7 +183,7 @@ class Table(OrderedDict):
    """

    @classmethod
-    def from_dict(cls, data, name=None):
+    def from_dict(cls, data: dict, name: Optional[str] = None) -> "Table":
        """Initialize a new table from a dict.

        data (dict): The dictionary.

@@ -175,7 +196,7 @@ class Table(OrderedDict):
        self.update(data)
        return self

-    def __init__(self, name=None, data=None):
+    def __init__(self, name: Optional[str] = None, data: Optional[dict] = None) -> None:
        """Initialize a new table.

        name (str): Optional table name for reference.

@@ -193,7 +214,7 @@ class Table(OrderedDict):
        if data:
            self.update(data)

-    def __setitem__(self, key, value):
+    def __setitem__(self, key: Union[str, int], value: Any) -> None:
        """Set new key/value pair. String keys will be hashed.

        key (str / int): The key to set.

@@ -203,7 +224,7 @@ class Table(OrderedDict):
        OrderedDict.__setitem__(self, key, value)
        self.bloom.add(key)

-    def set(self, key, value):
+    def set(self, key: Union[str, int], value: Any) -> None:
        """Set new key/value pair. String keys will be hashed.
        Same as table[key] = value.

@@ -212,7 +233,7 @@ class Table(OrderedDict):
        """
        self[key] = value

-    def __getitem__(self, key):
+    def __getitem__(self, key: Union[str, int]) -> Any:
        """Get the value for a given key. String keys will be hashed.

        key (str / int): The key to get.

@@ -221,7 +242,7 @@ class Table(OrderedDict):
        key = get_string_id(key)
        return OrderedDict.__getitem__(self, key)

-    def get(self, key, default=None):
+    def get(self, key: Union[str, int], default: Optional[Any] = None) -> Any:
        """Get the value for a given key. String keys will be hashed.

        key (str / int): The key to get.

@@ -231,7 +252,7 @@ class Table(OrderedDict):
        key = get_string_id(key)
        return OrderedDict.get(self, key, default)

-    def __contains__(self, key):
+    def __contains__(self, key: Union[str, int]) -> bool:
        """Check whether a key is in the table. String keys will be hashed.

        key (str / int): The key to check.

@@ -243,7 +264,7 @@ class Table(OrderedDict):
            return False
        return OrderedDict.__contains__(self, key)

-    def to_bytes(self):
+    def to_bytes(self) -> bytes:
        """Serialize table to a bytestring.

        RETURNS (bytes): The serialized table.

@@ -257,7 +278,7 @@ class Table(OrderedDict):
        }
        return srsly.msgpack_dumps(data)

-    def from_bytes(self, bytes_data):
+    def from_bytes(self, bytes_data: bytes) -> "Table":
        """Load a table from a bytestring.

        bytes_data (bytes): The data to load.
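A small usage sketch of the Lookups/Table API from this diff; the table name and entries are invented for illustration:

from spacy.lookups import Lookups  # assumed module path

lookups = Lookups()
table = lookups.add_table("lemma_lookup", {"going": "go"})
assert "lemma_lookup" in lookups and len(lookups) == 1
assert table["going"] == "go"          # string keys are hashed via get_string_id
data = lookups.to_bytes()              # msgpack under the hood
reloaded = Lookups().from_bytes(data)
assert reloaded.get_table("lemma_lookup").get("going") == "go"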
@@ -26,8 +26,8 @@ def build_nel_encoder(tok2vec, nO=None):


@registry.assets.register("spacy.KBFromFile.v1")
-def load_kb(nlp_path, kb_path) -> KnowledgeBase:
-    vocab = Vocab().from_disk(Path(nlp_path) / "vocab")
+def load_kb(vocab_path, kb_path) -> KnowledgeBase:
+    vocab = Vocab().from_disk(vocab_path)
    kb = KnowledgeBase(vocab=vocab)
    kb.load_bulk(kb_path)
    return kb
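A sketch of resolving the registered asset by name and calling it directly, the way the config system would; the paths are hypothetical:

from spacy.util import registry  # the same registry used by the decorator above

load_kb = registry.assets.get("spacy.KBFromFile.v1")
kb = load_kb(vocab_path="/path/to/model/vocab", kb_path="/path/to/kb")  # hypothetical paths
print(kb.get_size_entities())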
@@ -1,30 +1,9 @@
-from thinc.api import (
-    Model,
-    reduce_mean,
-    Linear,
-    list2ragged,
-    Logistic,
-    ParametricAttention,
-)
-from thinc.api import chain, concatenate, clone, Dropout
-from thinc.api import (
-    SparseLinear,
-    Softmax,
-    softmax_activation,
-    Maxout,
-    reduce_sum,
-    Relu,
-    residual,
-    expand_window,
-)
-from thinc.api import (
-    HashEmbed,
-    with_ragged,
-    with_array,
-    with_cpu,
-    uniqued,
-    FeatureExtractor,
-)
+from typing import Optional
+from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
+from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
+from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
+from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued
+from thinc.api import Relu, residual, expand_window, FeatureExtractor

from ..spacy_vectors import SpacyVectors
from ... import util

@@ -34,7 +13,9 @@ from ..extract_ngrams import extract_ngrams


@registry.architectures.register("spacy.TextCatCNN.v1")
-def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
+def build_simple_cnn_text_classifier(
+    tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
+) -> Model:
    """
    Build a simple CNN text classifier, given a token-to-vector model as inputs.
    If exclusive_classes=True, a softmax non-linearity is applied, so that the

@@ -90,13 +71,25 @@ def build_text_classifier(
            nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
        )
        prefix = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout, seed=11
+            nO=width // 2,
+            nV=embed_size,
+            column=cols.index(PREFIX),
+            dropout=dropout,
+            seed=11,
        )
        suffix = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout, seed=12
+            nO=width // 2,
+            nV=embed_size,
+            column=cols.index(SUFFIX),
+            dropout=dropout,
+            seed=12,
        )
        shape = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout, seed=13
+            nO=width // 2,
+            nV=embed_size,
+            column=cols.index(SHAPE),
+            dropout=dropout,
+            seed=13,
        )

        width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
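The reformatted HashEmbed calls make the width bookkeeping easier to see: LOWER gets the full width and PREFIX, SUFFIX and SHAPE get half each, so the concatenated input width is width + 3 * (width // 2). A standalone sketch, assuming Thinc 8.x and made-up column indices:

from thinc.api import HashEmbed

width, embed_size = 64, 2000
lower = HashEmbed(nO=width, nV=embed_size, column=0, dropout=None, seed=10)
prefix = HashEmbed(nO=width // 2, nV=embed_size, column=1, dropout=None, seed=11)
suffix = HashEmbed(nO=width // 2, nV=embed_size, column=2, dropout=None, seed=12)
shape = HashEmbed(nO=width // 2, nV=embed_size, column=3, dropout=None, seed=13)
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
assert width_nI == width + 3 * (width // 2)  # 160 for width=64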
@@ -11,8 +11,8 @@ from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE


@registry.architectures.register("spacy.Tok2VecTensors.v1")
-def tok2vec_tensors_v1(width):
-    tok2vec = Tok2VecListener("tok2vec", width=width)
+def tok2vec_tensors_v1(width, upstream="*"):
+    tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
    return tok2vec
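A sketch of building the registered layer directly: the new upstream argument defaults to "*", which lets the listener connect to any upstream Tok2Vec component, while a concrete name pins it to one:

from spacy.util import registry  # assumed import path for the registry

make_tok2vec_tensors = registry.architectures.get("spacy.Tok2VecTensors.v1")
listener = make_tok2vec_tensors(width=96, upstream="*")  # width should match the upstream model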
@@ -1,30 +1,37 @@
+from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING
from wasabi import Printer
import warnings

from .tokens import Doc, Token, Span
from .errors import Errors, Warnings
+from .util import dot_to_dict

+if TYPE_CHECKING:
+    # This lets us add type hints for mypy etc. without causing circular imports
+    from .language import Language  # noqa: F401


-def analyze_pipes(pipeline, name, pipe, index, warn=True):
+def analyze_pipes(
+    nlp: "Language", name: str, index: int, warn: bool = True
+) -> List[str]:
    """Analyze a pipeline component with respect to its position in the current
    pipeline and the other components. Will check whether requirements are
    fulfilled (e.g. if previous components assign the attributes).

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    nlp (Language): The current nlp object.
    name (str): The name of the pipeline component to analyze.
-    pipe (callable): The pipeline component function to analyze.
    index (int): The index of the component in the pipeline.
    warn (bool): Show user warning if problem is found.
-    RETURNS (list): The problems found for the given pipeline component.
+    RETURNS (List[str]): The problems found for the given pipeline component.
    """
-    assert pipeline[index][0] == name
-    prev_pipes = pipeline[:index]
-    pipe_requires = getattr(pipe, "requires", [])
-    requires = {annot: False for annot in pipe_requires}
+    assert nlp.pipeline[index][0] == name
+    prev_pipes = nlp.pipeline[:index]
+    meta = nlp.get_pipe_meta(name)
+    requires = {annot: False for annot in meta.requires}
    if requires:
        for prev_name, prev_pipe in prev_pipes:
-            prev_assigns = getattr(prev_pipe, "assigns", [])
-            for annot in prev_assigns:
+            prev_meta = nlp.get_pipe_meta(prev_name)
+            for annot in prev_meta.assigns:
                requires[annot] = True
    problems = []
    for annot, fulfilled in requires.items():

@@ -35,46 +42,29 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
    return problems


-def analyze_all_pipes(pipeline, warn=True):
+def analyze_all_pipes(nlp: "Language", warn: bool = True) -> Dict[str, List[str]]:
    """Analyze all pipes in the pipeline in order.

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    nlp (Language): The current nlp object.
    warn (bool): Show user warning if problem is found.
-    RETURNS (dict): The problems found, keyed by component name.
+    RETURNS (Dict[str, List[str]]): The problems found, keyed by component name.
    """
    problems = {}
-    for i, (name, pipe) in enumerate(pipeline):
-        problems[name] = analyze_pipes(pipeline, name, pipe, i, warn=warn)
+    for i, name in enumerate(nlp.pipe_names):
+        problems[name] = analyze_pipes(nlp, name, i, warn=warn)
    return problems


-def dot_to_dict(values):
-    """Convert dot notation to a dict. For example: ["token.pos", "token._.xyz"]
-    become {"token": {"pos": True, "_": {"xyz": True }}}.
-
-    values (iterable): The values to convert.
-    RETURNS (dict): The converted values.
-    """
-    result = {}
-    for value in values:
-        path = result
-        parts = value.lower().split(".")
-        for i, item in enumerate(parts):
-            is_last = i == len(parts) - 1
-            path = path.setdefault(item, True if is_last else {})
-    return result
-
-
-def validate_attrs(values):
+def validate_attrs(values: Iterable[str]) -> Iterable[str]:
    """Validate component attributes provided to "assigns", "requires" etc.
    Raises error for invalid attributes and formatting. Doesn't check if
    custom extension attributes are registered, since this is something the
    user might want to do themselves later in the component.

-    values (iterable): The string attributes to check, e.g. `["token.pos"]`.
-    RETURNS (iterable): The checked attributes.
+    values (Iterable[str]): The string attributes to check, e.g. `["token.pos"]`.
+    RETURNS (Iterable[str]): The checked attributes.
    """
-    data = dot_to_dict(values)
+    data = dot_to_dict({value: True for value in values})
    objs = {"doc": Doc, "token": Token, "span": Span}
    for obj_key, attrs in data.items():
        if obj_key == "span":

@@ -111,37 +101,40 @@ def validate_attrs(values):
    return values


-def _get_feature_for_attr(pipeline, attr, feature):
+def _get_feature_for_attr(nlp: "Language", attr: str, feature: str) -> List[str]:
    assert feature in ["assigns", "requires"]
    result = []
-    for pipe_name, pipe in pipeline:
-        pipe_assigns = getattr(pipe, feature, [])
+    for pipe_name in nlp.pipe_names:
+        meta = nlp.get_pipe_meta(pipe_name)
+        pipe_assigns = getattr(meta, feature, [])
        if attr in pipe_assigns:
-            result.append((pipe_name, pipe))
+            result.append(pipe_name)
    return result


-def get_assigns_for_attr(pipeline, attr):
+def get_assigns_for_attr(nlp: "Language", attr: str) -> List[str]:
    """Get all pipeline components that assign an attr, e.g. "doc.tensor".

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    pipeline (Language): The current nlp object.
    attr (str): The attribute to check.
-    RETURNS (list): (name, pipeline) tuples of components that assign the attr.
+    RETURNS (List[str]): Names of components that require the attr.
    """
-    return _get_feature_for_attr(pipeline, attr, "assigns")
+    return _get_feature_for_attr(nlp, attr, "assigns")


-def get_requires_for_attr(pipeline, attr):
+def get_requires_for_attr(nlp: "Language", attr: str) -> List[str]:
    """Get all pipeline components that require an attr, e.g. "doc.tensor".

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    pipeline (Language): The current nlp object.
    attr (str): The attribute to check.
-    RETURNS (list): (name, pipeline) tuples of components that require the attr.
+    RETURNS (List[str]): Names of components that require the attr.
    """
-    return _get_feature_for_attr(pipeline, attr, "requires")
+    return _get_feature_for_attr(nlp, attr, "requires")


-def print_summary(nlp, pretty=True, no_print=False):
+def print_summary(
+    nlp: "Language", pretty: bool = True, no_print: bool = False
+) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]:
    """Print a formatted summary for the current nlp object's pipeline. Shows
    a table with the pipeline components and why they assign and require, as
    well as any problems if available.

@@ -154,12 +147,10 @@ def print_summary(nlp, pretty=True, no_print=False):
    msg = Printer(pretty=pretty, no_print=no_print)
    overview = []
    problems = {}
-    for i, (name, pipe) in enumerate(nlp.pipeline):
-        requires = getattr(pipe, "requires", [])
-        assigns = getattr(pipe, "assigns", [])
-        retok = getattr(pipe, "retokenizes", False)
-        overview.append((i, name, requires, assigns, retok))
-        problems[name] = analyze_pipes(nlp.pipeline, name, pipe, i, warn=False)
+    for i, name in enumerate(nlp.pipe_names):
+        meta = nlp.get_pipe_meta(name)
+        overview.append((i, name, meta.requires, meta.assigns, meta.retokenizes))
+        problems[name] = analyze_pipes(nlp, name, i, warn=False)
    msg.divider("Pipeline Overview")
    header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
    msg.table(overview, header=header, divider=True, multiline=True)

@@ -175,15 +166,19 @@ def print_summary(nlp, pretty=True, no_print=False):
        return {"overview": overview, "problems": problems}


-def count_pipeline_interdependencies(pipeline):
+def count_pipeline_interdependencies(nlp: "Language") -> List[int]:
    """Count how many subsequent components require an annotation set by each
    component in the pipeline.

+    nlp (Language): The current nlp object.
+    RETURNS (List[int]): The interdependency counts.
    """
    pipe_assigns = []
    pipe_requires = []
-    for name, pipe in pipeline:
-        pipe_assigns.append(set(getattr(pipe, "assigns", [])))
-        pipe_requires.append(set(getattr(pipe, "requires", [])))
+    for name in nlp.pipe_names:
+        meta = nlp.get_pipe_meta(name)
+        pipe_assigns.append(set(meta.assigns))
+        pipe_requires.append(set(meta.requires))
    counts = []
    for i, assigns in enumerate(pipe_assigns):
        count = 0
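A usage sketch of the analysis helpers after this change; the import path is assumed (the file name is not shown in this diff) and nlp is assumed to be an existing Language object whose component factories declare assigns/requires metadata:

from spacy.pipe_analysis import analyze_all_pipes, print_summary  # assumed module path

problems = analyze_all_pipes(nlp, warn=False)  # component name -> unfulfilled requirements
summary = print_summary(nlp, no_print=True)    # {"overview": [...], "problems": {...}}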
@@ -1,28 +1,33 @@
-from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
-from .pipes import TextCategorizer, Pipe, Sentencizer
-from .pipes import SentenceRecognizer
-from .simple_ner import SimpleNER
-from .morphologizer import Morphologizer
+from .dep_parser import DependencyParser
+from .entity_linker import EntityLinker
+from .ner import EntityRecognizer
from .entityruler import EntityRuler
+from .morphologizer import Morphologizer
+from .pipe import Pipe
+from spacy.pipeline.senter import SentenceRecognizer
+from .sentencizer import Sentencizer
+from .simple_ner import SimpleNER
+from .tagger import Tagger
+from .textcat import TextCategorizer
from .tok2vec import Tok2Vec
from .hooks import SentenceSegmenter, SimilarityHook
from .functions import merge_entities, merge_noun_chunks, merge_subtokens

__all__ = [
-    "Tagger",
    "DependencyParser",
-    "EntityRecognizer",
    "EntityLinker",
-    "TextCategorizer",
-    "Tok2Vec",
-    "Pipe",
-    "Morphologizer",
+    "EntityRecognizer",
    "EntityRuler",
-    "Sentencizer",
-    "SentenceSegmenter",
+    "Morphologizer",
+    "Pipe",
    "SentenceRecognizer",
+    "SentenceSegmenter",
+    "Sentencizer",
    "SimilarityHook",
    "SimpleNER",
+    "Tagger",
+    "TextCategorizer",
+    "Tok2Vec",
    "merge_entities",
    "merge_noun_chunks",
    "merge_subtokens",
@@ -1,93 +0,0 @@
-from pathlib import Path
-
-from ... import util
-
-
-def default_nel_config():
-    loc = Path(__file__).parent / "entity_linker_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_nel():
-    loc = Path(__file__).parent / "entity_linker_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_morphologizer_config():
-    loc = Path(__file__).parent / "morphologizer_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_morphologizer():
-    loc = Path(__file__).parent / "morphologizer_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_parser_config():
-    loc = Path(__file__).parent / "parser_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_parser():
-    loc = Path(__file__).parent / "parser_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_ner_config():
-    loc = Path(__file__).parent / "ner_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_ner():
-    loc = Path(__file__).parent / "ner_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_senter_config():
-    loc = Path(__file__).parent / "senter_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_senter():
-    loc = Path(__file__).parent / "senter_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_tagger_config():
-    loc = Path(__file__).parent / "tagger_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_tagger():
-    loc = Path(__file__).parent / "tagger_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_textcat_config():
-    loc = Path(__file__).parent / "textcat_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_textcat():
-    loc = Path(__file__).parent / "textcat_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_tok2vec_config():
-    loc = Path(__file__).parent / "tok2vec_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_tok2vec():
-    loc = Path(__file__).parent / "tok2vec_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_simple_ner_config():
-    loc = Path(__file__).parent / "simple_ner_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_simple_ner():
-    loc = Path(__file__).parent / "simple_ner_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
@@ -1,13 +0,0 @@
-[model]
-@architectures = "spacy.EntityLinker.v1"
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 2
-embed_size = 300
-window_size = 1
-maxout_pieces = 3
-subword_features = true
-dropout = null
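These deleted defaults follow the same pattern now handled through Thinc's config system, as in Config().from_str(DEFAULT_CONFIG) in the Chinese diff above. A sketch of parsing the block just shown (the variable name is made up):

from thinc.api import Config

DEFAULT_NEL_MODEL = """
[model]
@architectures = "spacy.EntityLinker.v1"

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 96
depth = 2
embed_size = 300
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null
"""

config = Config().from_str(DEFAULT_NEL_MODEL)
print(config["model"]["@architectures"])  # "spacy.EntityLinker.v1"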
@@ -1,14 +0,0 @@
-[model]
-@architectures = "spacy.Tagger.v1"
-
-[model.tok2vec]
-@architectures = "spacy.HashCharEmbedCNN.v1"
-pretrained_vectors = null
-width = 128
-depth = 4
-embed_size = 7000
-window_size = 1
-maxout_pieces = 3
-nM = 64
-nC = 8
-dropout = null
@@ -1,15 +0,0 @@
-[model]
-@architectures = "spacy.MultiTask.v1"
-maxout_pieces = 3
-token_vector_width = 96
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 4
-embed_size = 2000
-window_size = 1
-maxout_pieces = 2
-subword_features = true
-dropout = null
@@ -1,16 +0,0 @@
-[model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
-hidden_width = 64
-maxout_pieces = 2
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 4
-embed_size = 2000
-window_size = 1
-maxout_pieces = 3
-subword_features = true
-dropout = null
Some files were not shown because too many files have changed in this diff.