Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 09:57:26 +03:00)
	Refactor pipeline components, config and language data (#5759)
* Update with WIP
* Update with WIP
* Update with pipeline serialization
* Update types and pipe factories
* Add deep merge, tidy up and add tests
* Fix pipe creation from config
* Don't validate default configs on load
* Update spacy/language.py
  Co-authored-by: Ines Montani <ines@ines.io>
* Adjust factory/component meta error
* Clean up factory args and remove defaults
* Add test for failing empty dict defaults
* Update pipeline handling and methods
* provide KB as registry function instead of as object
* small change in test to make functionality more clear
* update example script for EL configuration
* Fix typo
* Simplify test
* Simplify test
* splitting pipes.pyx into separate files
* moving default configs to each component file
* fix batch_size type
* removing default values from component constructors where possible (TODO: test 4725)
* skip instead of xfail
* Add test for config -> nlp with multiple instances
* pipeline.pipes -> pipeline.pipe
* Tidy up, document, remove kwargs
* small cleanup/generalization for Tok2VecListener
* use DEFAULT_UPSTREAM field
* revert to avoid circular imports
* Fix tests
* Replace deprecated arg
* Make model dirs require config
* fix pickling of keyword-only arguments in constructor
* WIP: clean up and integrate full config
* Add helper to handle function args more reliably. Now also includes keyword-only args
* Fix config composition and serialization
* Improve config debugging and add visual diff
* Remove unused defaults and fix type
* Remove pipeline and factories from meta
* Update spacy/default_config.cfg
  Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Update spacy/default_config.cfg
* small UX edits
* avoid printing stack trace for debug CLI commands
* Add support for language-specific factories
* specify the section of the config which holds the model to debug
* WIP: add Language.from_config
* Update with language data refactor WIP
* Auto-format
* Add backwards-compat handling for Language.factories
* Update morphologizer.pyx
* Fix morphologizer
* Update and simplify lemmatizers
* Fix Japanese tests
* Port over tagger changes
* Fix Chinese and tests
* Update to latest Thinc
* WIP: xfail first Russian lemmatizer test
* Fix component-specific overrides
* fix nO for output layers in debug_model
* Fix default value
* Fix tests and don't pass objects in config
* Fix deep merging
* Fix lemma lookup data registry. Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)
* Add types
* Add Vocab.from_config
* Fix typo
* Fix tests
* Make config copying more elegant
* Fix pipe analysis
* Fix lemmatizers and is_base_form
* WIP: move language defaults to config
* Fix morphology type
* Fix vocab
* Remove comment
* Update to latest Thinc
* Add morph rules to config
* Tidy up
* Remove set_morphology option from tagger factory
* Hack use_gpu
* Move [pipeline] to top-level block and make [nlp.pipeline] list. Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them
* Fix use_gpu and resume in CLI
* Auto-format
* Remove resume from config
* Fix formatting and error
* [pipeline] -> [components]
* Fix types
* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
parent 311d0bde29
commit 43b960c01b
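The core of the refactor is that a pipeline is now described entirely by its config: the [nlp] block holds the language and the component order as a list, per-component settings live in [components.*] blocks, and everything is built through util.load_model_from_config. A minimal sketch of that flow (the keys beyond "lang" and "pipeline" are illustrative assumptions, not copied from this commit; the real defaults come from spacy/default_config.cfg):

from thinc.api import Config
from spacy import util

# Illustrative config in the new layout: component order as a list under
# [nlp], per-component settings under [components.*]. The "factory" key is
# an assumption about the component block schema.
cfg_str = """
[nlp]
lang = "en"
pipeline = ["tagger"]

[components]

[components.tagger]
factory = "tagger"
"""

config = Config().from_str(cfg_str)
# auto_fill=True completes missing settings from the built-in defaults and
# returns both the constructed nlp object and the fully resolved config.
nlp, filled = util.load_model_from_config(config, auto_fill=True)
print(nlp.pipe_names)

The diff below updates the example scripts, packaging metadata and the CLI commands (debug, pretrain, profile, train and friends) to this flow.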
@@ -17,7 +17,6 @@ import plac
 import random
 from pathlib import Path
 import spacy
-from spacy.kb import KnowledgeBase
 
 from spacy.gold import Example
 from spacy.pipeline import EntityRuler

@@ -82,12 +81,16 @@ def main(kb_path, vocab_path, output_dir=None, n_iter=50):
 
     # Create the Entity Linker component and add it to the pipeline.
     if "entity_linker" not in nlp.pipe_names:
-        kb = KnowledgeBase(vocab=nlp.vocab)
-        kb.load_bulk(kb_path)
-        print("Loaded Knowledge Base from '%s'" % kb_path)
-        # use only the predicted EL score and not the prior probability (for demo purposes)
-        cfg = {"kb": kb, "incl_prior": False}
+        print("Loading Knowledge Base from '%s'" % kb_path)
+        cfg = {
+            "kb": {
+                "@assets": "spacy.KBFromFile.v1",
+                "vocab_path": vocab_path,
+                "kb_path": kb_path,
+            },
+            # use only the predicted EL score and not the prior probability (for demo purposes)
+            "incl_prior": False,
+        }
         entity_linker = nlp.create_pipe("entity_linker", cfg)
         nlp.add_pipe(entity_linker, last=True)
 
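The removed lines above passed a live KnowledgeBase into the component config; the new config instead names a registered function ("@assets": "spacy.KBFromFile.v1") that builds the KB from the given paths, per the "provide KB as registry function instead of as object" item in the commit message. A hypothetical sketch of what such a registered loader could look like (the decorator name, signature and return value are assumptions; only the KnowledgeBase and load_bulk calls are taken from the removed code):

from pathlib import Path
from spacy.kb import KnowledgeBase
from spacy.util import registry
from spacy.vocab import Vocab


# "custom.KBFromFile.v1" is a made-up name; the registry table is implied by
# the "@assets" key used in the config above.
@registry.assets("custom.KBFromFile.v1")
def load_kb(vocab_path: str, kb_path: str) -> KnowledgeBase:
    # Rebuild the vocab and KB from disk instead of passing live objects
    # through the component config.
    vocab = Vocab().from_disk(Path(vocab_path))
    kb = KnowledgeBase(vocab=vocab)
    kb.load_bulk(Path(kb_path))
    return kb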
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a18,<8.0.0a20",
+    "thinc>=8.0.0a19,<8.0.0a30",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations"
 ]
@@ -1,11 +1,11 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a18,<8.0.0a20
+thinc>=8.0.0a19,<8.0.0a30
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.7.0,<1.1.0
+wasabi>=0.7.1,<1.1.0
 srsly>=2.1.0,<3.0.0
 catalogue>=0.0.7,<1.1.0
 typer>=0.3.0,<0.4.0
@@ -34,15 +34,15 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a18,<8.0.0a20
+    thinc>=8.0.0a19,<8.0.0a30
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a18,<8.0.0a20
+    thinc>=8.0.0a19,<8.0.0a30
     blis>=0.4.0,<0.5.0
-    wasabi>=0.7.0,<1.1.0
+    wasabi>=0.7.1,<1.1.0
     srsly>=2.1.0,<3.0.0
     catalogue>=0.0.7,<1.1.0
     typer>=0.3.0,<0.4.0
setup.py

@@ -32,8 +32,14 @@ MOD_NAMES = [
     "spacy.attrs",
     "spacy.kb",
     "spacy.morphology",
-    "spacy.pipeline.pipes",
+    "spacy.pipeline.dep_parser",
     "spacy.pipeline.morphologizer",
+    "spacy.pipeline.multitask",
+    "spacy.pipeline.ner",
+    "spacy.pipeline.pipe",
+    "spacy.pipeline.sentencizer",
+    "spacy.pipeline.senter",
+    "spacy.pipeline.tagger",
     "spacy.syntax.stateclass",
     "spacy.syntax._state",
     "spacy.tokenizer",
@@ -14,7 +14,6 @@ from .about import __version__
 from .errors import Errors, Warnings
 from . import util
 from .util import registry
-from .language import component
 
 
 if sys.maxunicode == 65535:
@@ -63,7 +63,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
     result = {}
     while args:
         opt = args.pop(0)
-        err = f"Invalid config override '{opt}'"
+        err = f"Invalid CLI argument '{opt}'"
         if opt.startswith("--"):  # new argument
             opt = opt.replace("--", "").replace("-", "_")
             if "." not in opt:

@@ -73,7 +73,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
             else:
                 value = args.pop(0)
             # Just like we do in the config, we're calling json.loads on the
-            # values. But since they come from the CLI, it'd b unintuitive to
+            # values. But since they come from the CLI, it'd be unintuitive to
             # explicitly mark strings with escaped quotes. So we're working
             # around that here by falling back to a string if parsing fails.
             # TODO: improve logic to handle simple types like list of strings?

@@ -82,7 +82,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
             except ValueError:
                 result[opt] = str(value)
         else:
-            msg.fail(f"{err}: options need to start with --", exits=1)
+            msg.fail(f"{err}: override option should start with --", exits=1)
     return result
 
 
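For reference, the parsing above turns dotted --section.key arguments into a flat overrides dict: values go through json.loads where possible and fall back to plain strings otherwise. Roughly:

from spacy.cli._util import parse_config_overrides

# "--training.batch_size 128" overrides "batch_size" in the [training] block;
# "128" is parsed as an int, "en" stays a string because json.loads rejects it.
args = ["--training.batch_size", "128", "--nlp.lang", "en"]
overrides = parse_config_overrides(args)
assert overrides == {"training.batch_size": 128, "nlp.lang": "en"}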
@@ -3,12 +3,12 @@ from pathlib import Path
 from collections import Counter
 import sys
 import srsly
-from wasabi import Printer, MESSAGES, msg
+from wasabi import Printer, MESSAGES, msg, diff_strings
 import typer
+from thinc.api import Config
 
 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli
-from ..schemas import ConfigSchema
 from ..gold import Corpus, Example
 from ..syntax import nonproj
 from ..language import Language

@@ -33,6 +33,9 @@ def debug_config_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True),
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    output_path: Optional[Path] = Opt(None, "--output", "-o", help="Output path for filled config or '-' for standard output", allow_dash=True),
+    auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"),
+    diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled")
     # fmt: on
 ):
     """Debug a config.cfg file and show validation errors. The command will

@@ -40,14 +43,37 @@ def debug_config_cli(
     validation errors are blocking and will prevent the rest of the config from
     being resolved. This means that you may not see all validation errors at
     once and some issues are only shown once previous errors have been fixed.
+    Similar as with the 'train' command, you can override settings from the config
+    as command line options. For instance, --training.batch_size 128 overrides
+    the value of "batch_size" in the block "[training]".
     """
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     with show_validation_error():
-        util.load_config(
-            config_path, create_objects=False, schema=ConfigSchema, overrides=overrides,
-        )
-    msg.good("Config is valid")
+        config = Config().from_disk(config_path)
+        try:
+            nlp, _ = util.load_model_from_config(
+                config, overrides=overrides, auto_fill=auto_fill
+            )
+        except ValueError as e:
+            msg.fail(str(e), exits=1)
+    is_stdout = output_path is not None and str(output_path) == "-"
+    if auto_fill:
+        orig_config = config.to_str()
+        filled_config = nlp.config.to_str()
+        if orig_config == filled_config:
+            msg.good("Original config is valid, no values were auto-filled")
+        else:
+            msg.good("Auto-filled config is valid")
+            if diff:
+                print(diff_strings(config.to_str(), nlp.config.to_str()))
+    else:
+        msg.good("Original config is valid", show=not is_stdout)
+    if is_stdout:
+        print(nlp.config.to_str())
+    elif output_path is not None:
+        nlp.config.to_disk(output_path)
+        msg.good(f"Saved updated config to {output_path}")
 
 
 @debug_cli.command(
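Outside the CLI, the auto-fill and diff logic added above boils down to a few calls (a sketch using only the functions imported in this file; "config.cfg" is a placeholder path):

from thinc.api import Config
from wasabi import diff_strings
from spacy import util

config = Config().from_disk("config.cfg")
# auto_fill=True completes the config with built-in defaults; nlp.config is
# the fully resolved version, which may differ from what was on disk.
nlp, _ = util.load_model_from_config(config, auto_fill=True)
if config.to_str() != nlp.config.to_str():
    print(diff_strings(config.to_str(), nlp.config.to_str()))
else:
    print("Original config is valid, no values were auto-filled")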
@@ -117,16 +143,13 @@ def debug_data(
     if not config_path.exists():
         msg.fail("Config file not found", config_path, exists=1)
     with show_validation_error():
-        config = util.load_config(
-            config_path,
-            create_objects=False,
-            schema=ConfigSchema,
-            overrides=config_overrides,
-        )
-    nlp = util.load_model_from_config(config["nlp"])
+        cfg = Config().from_disk(config_path)
+        nlp, config = util.load_model_from_config(cfg, overrides=config_overrides)
+    # TODO: handle base model
     lang = config["nlp"]["lang"]
-    base_model = config["nlp"]["base_model"]
-    pipeline = list(config["nlp"]["pipeline"].keys())
+    base_model = config["training"]["base_model"]
+    pipeline = nlp.pipe_names
+    factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
     tag_map_path = util.ensure_path(config["training"]["tag_map"])
     tag_map = {}
     if tag_map_path is not None:

@@ -164,19 +187,17 @@ def debug_data(
     msg.good("Corpus is loadable")
 
     # Create all gold data here to avoid iterating over the train_dataset constantly
-    gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True)
+    gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
     gold_train_unpreprocessed_data = _compile_gold(
-        train_dataset, pipeline, nlp, make_proj=False
+        train_dataset, factory_names, nlp, make_proj=False
     )
-    gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True)
+    gold_dev_data = _compile_gold(dev_dataset, factory_names, nlp, make_proj=True)
 
     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
 
     msg.divider("Training stats")
     msg.text(f"Training pipeline: {', '.join(pipeline)}")
-    for pipe in [p for p in pipeline if p not in nlp.factories]:
-        msg.fail(f"Pipeline component '{pipe}' not available in factories")
     if base_model:
         msg.text(f"Starting with base model '{base_model}'")
     else:

@@ -244,7 +265,7 @@ def debug_data(
     else:
         msg.info("No word vectors present in the model")
 
-    if "ner" in pipeline:
+    if "ner" in factory_names:
         # Get all unique NER labels present in the data
         labels = set(
             label for label in gold_train_data["ner"] if label not in ("O", "-", None)

@@ -332,7 +353,7 @@ def debug_data(
                 "with punctuation can not be trained with a noise level > 0."
             )
 
-    if "textcat" in pipeline:
+    if "textcat" in factory_names:
         msg.divider("Text Classification")
         labels = [label for label in gold_train_data["cats"]]
         model_labels = _get_labels_from_model(nlp, "textcat")

@@ -379,7 +400,7 @@ def debug_data(
                     "contains only instances with mutually-exclusive classes."
                 )
 
-    if "tagger" in pipeline:
+    if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
         tag_map = nlp.vocab.morphology.tag_map

@@ -394,7 +415,7 @@ def debug_data(
         for label in non_tagmap:
             msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")
 
-    if "parser" in pipeline:
+    if "parser" in factory_names:
         has_low_data_warning = False
         msg.divider("Dependency Parsing")
 

@@ -541,7 +562,10 @@ def _load_file(file_path: Path, msg: Printer) -> None:
 
 
 def _compile_gold(
-    examples: Sequence[Example], pipeline: List[str], nlp: Language, make_proj: bool
+    examples: Sequence[Example],
+    factory_names: List[str],
+    nlp: Language,
+    make_proj: bool,
 ) -> Dict[str, Any]:
     data = {
         "ner": Counter(),

@@ -573,7 +597,7 @@ def _compile_gold(
             for word in valid_words:
                 if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                     data["words_missing_vectors"].update([word])
-        if "ner" in pipeline:
+        if "ner" in factory_names:
             for i, label in enumerate(eg.get_aligned_ner()):
                 if label is None:
                     continue

@@ -595,14 +619,14 @@ def _compile_gold(
                     data["ner"][combined_label] += 1
                 elif label == "-":
                     data["ner"]["-"] += 1
-        if "textcat" in pipeline:
+        if "textcat" in factory_names:
             data["cats"].update(gold.cats)
             if list(gold.cats.values()).count(1.0) != 1:
                 data["n_cats_multilabel"] += 1
-        if "tagger" in pipeline:
+        if "tagger" in factory_names:
             tags = eg.get_aligned("TAG", as_string=True)
             data["tags"].update([x for x in tags if x is not None])
-        if "parser" in pipeline:
+        if "parser" in factory_names:
             aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=make_proj)
             data["deps"].update([x for x in aligned_deps if x is not None])
             for i, (dep, head) in enumerate(zip(aligned_deps, aligned_heads)):
						 | 
					@ -1,8 +1,11 @@
 | 
				
			||||||
 | 
					from typing import Dict, Any, Optional
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from wasabi import msg
 | 
					from wasabi import msg
 | 
				
			||||||
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
 | 
					from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam, Config
 | 
				
			||||||
 | 
					from thinc.api import Model
 | 
				
			||||||
 | 
					import typer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ._util import Arg, Opt, debug_cli
 | 
					from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides
 | 
				
			||||||
from .. import util
 | 
					from .. import util
 | 
				
			||||||
from ..lang.en import English
 | 
					from ..lang.en import English
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -10,8 +13,10 @@ from ..lang.en import English
 | 
				
			||||||
@debug_cli.command("model")
 | 
					@debug_cli.command("model")
 | 
				
			||||||
def debug_model_cli(
 | 
					def debug_model_cli(
 | 
				
			||||||
    # fmt: off
 | 
					    # fmt: off
 | 
				
			||||||
 | 
					    ctx: typer.Context,  # This is only used to read additional arguments
 | 
				
			||||||
    config_path: Path = Arg(..., help="Path to config file", exists=True),
 | 
					    config_path: Path = Arg(..., help="Path to config file", exists=True),
 | 
				
			||||||
    layers: str = Opt("", "--layers", "-l", help="Comma-separated names of pipeline components to train"),
 | 
					    section: str = Arg(..., help="Section that defines the model to be analysed"),
 | 
				
			||||||
 | 
					    layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
 | 
				
			||||||
    dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
 | 
					    dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
 | 
				
			||||||
    parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
 | 
					    parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
 | 
				
			||||||
    gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
 | 
					    gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
 | 
				
			||||||
| 
						 | 
					@ -20,14 +25,18 @@ def debug_model_cli(
 | 
				
			||||||
    P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
 | 
					    P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
 | 
				
			||||||
    P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
 | 
					    P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
 | 
				
			||||||
    P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"),
 | 
					    P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"),
 | 
				
			||||||
    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
 | 
					    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU")
 | 
				
			||||||
    seed: int = Opt(None, "--seed", "-s", help="Use GPU"),
 | 
					 | 
				
			||||||
    # fmt: on
 | 
					    # fmt: on
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Analyze a Thinc model implementation. Includes checks for internal structure
 | 
					    Analyze a Thinc model implementation. Includes checks for internal structure
 | 
				
			||||||
    and activations during training.
 | 
					    and activations during training.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					    if use_gpu >= 0:
 | 
				
			||||||
 | 
					        msg.info("Using GPU")
 | 
				
			||||||
 | 
					        require_gpu(use_gpu)
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        msg.info("Using CPU")
 | 
				
			||||||
    print_settings = {
 | 
					    print_settings = {
 | 
				
			||||||
        "dimensions": dimensions,
 | 
					        "dimensions": dimensions,
 | 
				
			||||||
        "parameters": parameters,
 | 
					        "parameters": parameters,
 | 
				
			||||||
| 
						 | 
					@ -39,27 +48,47 @@ def debug_model_cli(
 | 
				
			||||||
        "print_after_training": P2,
 | 
					        "print_after_training": P2,
 | 
				
			||||||
        "print_prediction": P3,
 | 
					        "print_prediction": P3,
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					    config_overrides = parse_config_overrides(ctx.args)
 | 
				
			||||||
 | 
					    cfg = Config().from_disk(config_path)
 | 
				
			||||||
 | 
					    with show_validation_error():
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            _, config = util.load_model_from_config(cfg, overrides=config_overrides)
 | 
				
			||||||
 | 
					        except ValueError as e:
 | 
				
			||||||
 | 
					            msg.fail(str(e), exits=1)
 | 
				
			||||||
 | 
					    seed = config["pretraining"]["seed"]
 | 
				
			||||||
    if seed is not None:
 | 
					    if seed is not None:
 | 
				
			||||||
        msg.info(f"Fixing random seed: {seed}")
 | 
					        msg.info(f"Fixing random seed: {seed}")
 | 
				
			||||||
        fix_random_seed(seed)
 | 
					        fix_random_seed(seed)
 | 
				
			||||||
    if use_gpu >= 0:
 | 
					 | 
				
			||||||
        msg.info(f"Using GPU: {use_gpu}")
 | 
					 | 
				
			||||||
        require_gpu(use_gpu)
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        msg.info(f"Using CPU")
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    debug_model(
 | 
					    component = config
 | 
				
			||||||
        config_path, print_settings=print_settings,
 | 
					    parts = section.split(".")
 | 
				
			||||||
 | 
					    for item in parts:
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            component = component[item]
 | 
				
			||||||
 | 
					        except KeyError:
 | 
				
			||||||
 | 
					            msg.fail(
 | 
				
			||||||
 | 
					                f"The section '{section}' is not a valid section in the provided config.",
 | 
				
			||||||
 | 
					                exits=1,
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					    if hasattr(component, "model"):
 | 
				
			||||||
 | 
					        model = component.model
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        msg.fail(
 | 
				
			||||||
 | 
					            f"The section '{section}' does not specify an object that holds a Model.",
 | 
				
			||||||
 | 
					            exits=1,
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					    debug_model(model, print_settings=print_settings)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def debug_model(config_path: Path, *, print_settings=None):
 | 
					def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
 | 
				
			||||||
 | 
					    if not isinstance(model, Model):
 | 
				
			||||||
 | 
					        msg.fail(
 | 
				
			||||||
 | 
					            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
 | 
				
			||||||
 | 
					            exits=1,
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
    if print_settings is None:
 | 
					    if print_settings is None:
 | 
				
			||||||
        print_settings = {}
 | 
					        print_settings = {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    model = util.load_config(config_path, create_objects=True)["model"]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    # STEP 0: Printing before training
 | 
					    # STEP 0: Printing before training
 | 
				
			||||||
    msg.info(f"Analysing model with ID {model.id}")
 | 
					    msg.info(f"Analysing model with ID {model.id}")
 | 
				
			||||||
    if print_settings.get("print_before_training"):
 | 
					    if print_settings.get("print_before_training"):
 | 
				
			||||||
| 
						 | 
					@ -67,7 +96,9 @@ def debug_model(config_path: Path, *, print_settings=None):
 | 
				
			||||||
        _print_model(model, print_settings)
 | 
					        _print_model(model, print_settings)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # STEP 1: Initializing the model and printing again
 | 
					    # STEP 1: Initializing the model and printing again
 | 
				
			||||||
    model.initialize(X=_get_docs(), Y=_get_output(model.ops.xp))
 | 
					    Y = _get_output(model.ops.xp)
 | 
				
			||||||
 | 
					    _set_output_dim(nO=Y.shape[-1], model=model)
 | 
				
			||||||
 | 
					    model.initialize(X=_get_docs(), Y=Y)
 | 
				
			||||||
    if print_settings.get("print_after_init"):
 | 
					    if print_settings.get("print_after_init"):
 | 
				
			||||||
        msg.info(f"After initialization:")
 | 
					        msg.info(f"After initialization:")
 | 
				
			||||||
        _print_model(model, print_settings)
 | 
					        _print_model(model, print_settings)
 | 
				
			||||||
| 
						 | 
					@ -110,12 +141,16 @@ def _get_docs():
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _get_output(xp):
 | 
					def _get_output(xp):
 | 
				
			||||||
    return xp.asarray(
 | 
					    return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")
 | 
				
			||||||
        [
 | 
					
 | 
				
			||||||
            xp.asarray([i + 10, i + 20, i + 30], dtype="float32")
 | 
					
 | 
				
			||||||
            for i, _ in enumerate(_get_docs())
 | 
					def _set_output_dim(model, nO):
 | 
				
			||||||
        ]
 | 
					    # the dim inference doesn't always work 100%, we need this hack like we have it in pipe.pyx
 | 
				
			||||||
    )
 | 
					    if model.has_dim("nO") is None:
 | 
				
			||||||
 | 
					        model.set_dim("nO", nO)
 | 
				
			||||||
 | 
					    if model.has_ref("output_layer"):
 | 
				
			||||||
 | 
					        if model.get_ref("output_layer").has_dim("nO") is None:
 | 
				
			||||||
 | 
					            model.get_ref("output_layer").set_dim("nO", nO)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _print_model(model, print_settings):
 | 
					def _print_model(model, print_settings):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -105,9 +105,10 @@ def evaluate(
 | 
				
			||||||
        print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat)
 | 
					        print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if displacy_path:
 | 
					    if displacy_path:
 | 
				
			||||||
 | 
					        factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
 | 
				
			||||||
        docs = [ex.predicted for ex in dev_dataset]
 | 
					        docs = [ex.predicted for ex in dev_dataset]
 | 
				
			||||||
        render_deps = "parser" in nlp.meta.get("pipeline", [])
 | 
					        render_deps = "parser" in factory_names
 | 
				
			||||||
        render_ents = "ner" in nlp.meta.get("pipeline", [])
 | 
					        render_ents = "ner" in factory_names
 | 
				
			||||||
        render_parses(
 | 
					        render_parses(
 | 
				
			||||||
            docs,
 | 
					            docs,
 | 
				
			||||||
            displacy_path,
 | 
					            displacy_path,
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -84,7 +84,6 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
 | 
				
			||||||
        msg.fail("Can't find model meta.json", meta_path, exits=1)
 | 
					        msg.fail("Can't find model meta.json", meta_path, exits=1)
 | 
				
			||||||
    meta = srsly.read_json(meta_path)
 | 
					    meta = srsly.read_json(meta_path)
 | 
				
			||||||
    if model_path.resolve() != model_path:
 | 
					    if model_path.resolve() != model_path:
 | 
				
			||||||
        meta["link"] = str(model_path)
 | 
					 | 
				
			||||||
        meta["source"] = str(model_path.resolve())
 | 
					        meta["source"] = str(model_path.resolve())
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        meta["source"] = str(model_path)
 | 
					        meta["source"] = str(model_path)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -125,7 +125,6 @@ def get_meta(
 | 
				
			||||||
    meta.update(existing_meta)
 | 
					    meta.update(existing_meta)
 | 
				
			||||||
    nlp = util.load_model_from_path(Path(model_path))
 | 
					    nlp = util.load_model_from_path(Path(model_path))
 | 
				
			||||||
    meta["spacy_version"] = util.get_model_version_range(about.__version__)
 | 
					    meta["spacy_version"] = util.get_model_version_range(about.__version__)
 | 
				
			||||||
    meta["pipeline"] = nlp.pipe_names
 | 
					 | 
				
			||||||
    meta["vectors"] = {
 | 
					    meta["vectors"] = {
 | 
				
			||||||
        "width": nlp.vocab.vectors_length,
 | 
					        "width": nlp.vocab.vectors_length,
 | 
				
			||||||
        "vectors": len(nlp.vocab.vectors),
 | 
					        "vectors": len(nlp.vocab.vectors),
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -5,7 +5,7 @@ import time
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
from collections import Counter
 | 
					from collections import Counter
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
 | 
					from thinc.api import use_pytorch_for_gpu_memory, require_gpu, Config
 | 
				
			||||||
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
 | 
					from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
 | 
				
			||||||
from thinc.api import CosineDistance, L2Distance
 | 
					from thinc.api import CosineDistance, L2Distance
 | 
				
			||||||
from wasabi import msg
 | 
					from wasabi import msg
 | 
				
			||||||
| 
						 | 
					@ -15,7 +15,6 @@ import typer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 | 
					from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 | 
				
			||||||
from ._util import import_code
 | 
					from ._util import import_code
 | 
				
			||||||
from ..schemas import ConfigSchema
 | 
					 | 
				
			||||||
from ..errors import Errors
 | 
					from ..errors import Errors
 | 
				
			||||||
from ..ml.models.multi_task import build_cloze_multi_task_model
 | 
					from ..ml.models.multi_task import build_cloze_multi_task_model
 | 
				
			||||||
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 | 
					from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 | 
				
			||||||
| 
						 | 
					@ -37,6 +36,7 @@ def pretrain_cli(
 | 
				
			||||||
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 | 
					    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 | 
				
			||||||
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
 | 
					    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
 | 
				
			||||||
    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
 | 
					    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
 | 
				
			||||||
 | 
					    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU"),
 | 
				
			||||||
    # fmt: on
 | 
					    # fmt: on
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
| 
						 | 
					@ -67,6 +67,7 @@ def pretrain_cli(
 | 
				
			||||||
        config_overrides=overrides,
 | 
					        config_overrides=overrides,
 | 
				
			||||||
        resume_path=resume_path,
 | 
					        resume_path=resume_path,
 | 
				
			||||||
        epoch_resume=epoch_resume,
 | 
					        epoch_resume=epoch_resume,
 | 
				
			||||||
 | 
					        use_gpu=use_gpu,
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -77,40 +78,29 @@ def pretrain(
 | 
				
			||||||
    config_overrides: Dict[str, Any] = {},
 | 
					    config_overrides: Dict[str, Any] = {},
 | 
				
			||||||
    resume_path: Optional[Path] = None,
 | 
					    resume_path: Optional[Path] = None,
 | 
				
			||||||
    epoch_resume: Optional[int] = None,
 | 
					    epoch_resume: Optional[int] = None,
 | 
				
			||||||
 | 
					    use_gpu: int = -1,
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
 | 
					    verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
 | 
				
			||||||
    msg.info(f"Loading config from: {config_path}")
 | 
					 | 
				
			||||||
    with show_validation_error():
 | 
					 | 
				
			||||||
        config = util.load_config(
 | 
					 | 
				
			||||||
            config_path,
 | 
					 | 
				
			||||||
            create_objects=False,
 | 
					 | 
				
			||||||
            validate=True,
 | 
					 | 
				
			||||||
            schema=ConfigSchema,
 | 
					 | 
				
			||||||
            overrides=config_overrides,
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    if not output_dir.exists():
 | 
					 | 
				
			||||||
        output_dir.mkdir()
 | 
					 | 
				
			||||||
        msg.good(f"Created output directory: {output_dir}")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    use_gpu = config["training"]["use_gpu"]
 | 
					 | 
				
			||||||
    if use_gpu >= 0:
 | 
					    if use_gpu >= 0:
 | 
				
			||||||
        msg.info("Using GPU")
 | 
					        msg.info("Using GPU")
 | 
				
			||||||
        require_gpu(use_gpu)
 | 
					        require_gpu(use_gpu)
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        msg.info("Using CPU")
 | 
					        msg.info("Using CPU")
 | 
				
			||||||
 | 
					    msg.info(f"Loading config from: {config_path}")
 | 
				
			||||||
 | 
					    config = Config().from_disk(config_path)
 | 
				
			||||||
 | 
					    with show_validation_error():
 | 
				
			||||||
 | 
					        nlp, config = util.load_model_from_config(config, overrides=config_overrides)
 | 
				
			||||||
 | 
					    # TODO: validate that [pretraining] block exists
 | 
				
			||||||
 | 
					    if not output_dir.exists():
 | 
				
			||||||
 | 
					        output_dir.mkdir()
 | 
				
			||||||
 | 
					        msg.good(f"Created output directory: {output_dir}")
 | 
				
			||||||
    seed = config["pretraining"]["seed"]
 | 
					    seed = config["pretraining"]["seed"]
 | 
				
			||||||
    if seed is not None:
 | 
					    if seed is not None:
 | 
				
			||||||
        fix_random_seed(seed)
 | 
					        fix_random_seed(seed)
 | 
				
			||||||
    if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]:
 | 
					    if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]:
 | 
				
			||||||
        use_pytorch_for_gpu_memory()
 | 
					        use_pytorch_for_gpu_memory()
 | 
				
			||||||
 | 
					    config.to_disk(output_dir / "config.cfg")
 | 
				
			||||||
    nlp_config = config["nlp"]
 | 
					 | 
				
			||||||
    srsly.write_json(output_dir / "config.json", config)
 | 
					 | 
				
			||||||
    msg.good("Saved config file in the output directory")
 | 
					    msg.good("Saved config file in the output directory")
 | 
				
			||||||
 | 
					 | 
				
			||||||
    config = util.load_config(config_path, create_objects=True)
 | 
					 | 
				
			||||||
    nlp = util.load_model_from_config(nlp_config)
 | 
					 | 
				
			||||||
    pretrain_config = config["pretraining"]
 | 
					    pretrain_config = config["pretraining"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if texts_loc != "-":  # reading from a file
 | 
					    if texts_loc != "-":  # reading from a file
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -25,7 +25,7 @@ def profile_cli(
 | 
				
			||||||
    # fmt: on
 | 
					    # fmt: on
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Profile a spaCy pipeline, to find out which functions take the most time.
 | 
					    Profile which functions take the most time in a spaCy pipeline.
 | 
				
			||||||
    Input should be formatted as one JSON object per line with a key "text".
 | 
					    Input should be formatted as one JSON object per line with a key "text".
 | 
				
			||||||
    It can either be provided as a JSONL file, or be read from sys.sytdin.
 | 
					    It can either be provided as a JSONL file, or be read from sys.sytdin.
 | 
				
			||||||
    If no input file is specified, the IMDB dataset is loaded via Thinc.
 | 
					    If no input file is specified, the IMDB dataset is loaded via Thinc.
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,4 +1,4 @@
 | 
				
			||||||
from typing import Optional, Dict, Any
 | 
					from typing import Optional, Dict, Any, Tuple, Union, Callable, List
 | 
				
			||||||
from timeit import default_timer as timer
 | 
					from timeit import default_timer as timer
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
import tqdm
 | 
					import tqdm
 | 
				
			||||||
| 
						 | 
					@ -7,6 +7,7 @@ from wasabi import msg
 | 
				
			||||||
import thinc
 | 
					import thinc
 | 
				
			||||||
import thinc.schedules
 | 
					import thinc.schedules
 | 
				
			||||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
 | 
					from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
 | 
				
			||||||
 | 
					from thinc.api import Config, Optimizer
 | 
				
			||||||
import random
 | 
					import random
 | 
				
			||||||
import typer
 | 
					import typer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -14,18 +15,15 @@ from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 | 
				
			||||||
from ._util import import_code
 | 
					from ._util import import_code
 | 
				
			||||||
from ..gold import Corpus, Example
 | 
					from ..gold import Corpus, Example
 | 
				
			||||||
from ..lookups import Lookups
 | 
					from ..lookups import Lookups
 | 
				
			||||||
 | 
					from ..language import Language
 | 
				
			||||||
from .. import util
 | 
					from .. import util
 | 
				
			||||||
from ..errors import Errors
 | 
					from ..errors import Errors
 | 
				
			||||||
from ..schemas import ConfigSchema
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Don't remove - required to load the built-in architectures
 | 
					# Don't remove - required to load the built-in architectures
 | 
				
			||||||
from ..ml import models  # noqa: F401
 | 
					from ..ml import models  # noqa: F401
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
registry = util.registry
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@app.command(
 | 
					@app.command(
 | 
				
			||||||
    "train", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
 | 
					    "train", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
| 
						 | 
					@ -38,6 +36,8 @@ def train_cli(
 | 
				
			||||||
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
 | 
					    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
 | 
				
			||||||
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 | 
					    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 | 
				
			||||||
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
 | 
					    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
 | 
				
			||||||
 | 
					    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU"),
 | 
				
			||||||
 | 
					    resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
 | 
				
			||||||
    # fmt: on
 | 
					    # fmt: on
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
| 
						 | 
					@ -53,9 +53,7 @@ def train_cli(
 | 
				
			||||||
    referenced in the config.
 | 
					    referenced in the config.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    util.set_env_log(verbose)
 | 
					    util.set_env_log(verbose)
 | 
				
			||||||
    verify_cli_args(
 | 
					    verify_cli_args(train_path, dev_path, config_path)
 | 
				
			||||||
        train_path=train_path, dev_path=dev_path, config_path=config_path,
 | 
					 | 
				
			||||||
    )
 | 
					 | 
				
			||||||
    overrides = parse_config_overrides(ctx.args)
 | 
					    overrides = parse_config_overrides(ctx.args)
 | 
				
			||||||
    import_code(code_path)
 | 
					    import_code(code_path)
 | 
				
			||||||
    train(
 | 
					    train(
 | 
				
			||||||
| 
						 | 
					@ -63,6 +61,8 @@ def train_cli(
 | 
				
			||||||
        {"train": train_path, "dev": dev_path},
 | 
					        {"train": train_path, "dev": dev_path},
 | 
				
			||||||
        output_path=output_path,
 | 
					        output_path=output_path,
 | 
				
			||||||
        config_overrides=overrides,
 | 
					        config_overrides=overrides,
 | 
				
			||||||
 | 
					        use_gpu=use_gpu,
 | 
				
			||||||
 | 
					        resume_training=resume,
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -72,61 +72,51 @@ def train(
 | 
				
			||||||
    raw_text: Optional[Path] = None,
 | 
					    raw_text: Optional[Path] = None,
 | 
				
			||||||
    output_path: Optional[Path] = None,
 | 
					    output_path: Optional[Path] = None,
 | 
				
			||||||
    config_overrides: Dict[str, Any] = {},
 | 
					    config_overrides: Dict[str, Any] = {},
 | 
				
			||||||
 | 
					    use_gpu: int = -1,
 | 
				
			||||||
 | 
					    resume_training: bool = False,
 | 
				
			||||||
) -> None:
 | 
					) -> None:
 | 
				
			||||||
    msg.info(f"Loading config from: {config_path}")
 | 
					 | 
				
			||||||
    # Read the config first without creating objects, to get to the original nlp_config
 | 
					 | 
				
			||||||
    with show_validation_error():
 | 
					 | 
				
			||||||
        config = util.load_config(
 | 
					 | 
				
			||||||
            config_path,
 | 
					 | 
				
			||||||
            create_objects=False,
 | 
					 | 
				
			||||||
            schema=ConfigSchema,
 | 
					 | 
				
			||||||
            overrides=config_overrides,
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    use_gpu = config["training"]["use_gpu"]
 | 
					 | 
				
			||||||
    if use_gpu >= 0:
 | 
					    if use_gpu >= 0:
 | 
				
			||||||
        msg.info(f"Using GPU: {use_gpu}")
 | 
					        msg.info(f"Using GPU: {use_gpu}")
 | 
				
			||||||
        require_gpu(use_gpu)
 | 
					        require_gpu(use_gpu)
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        msg.info("Using CPU")
 | 
					        msg.info("Using CPU")
 | 
				
			||||||
 | 
					    msg.info(f"Loading config and nlp from: {config_path}")
 | 
				
			||||||
 | 
					    config = Config().from_disk(config_path)
 | 
				
			||||||
 | 
					    with show_validation_error():
 | 
				
			||||||
 | 
					        nlp, config = util.load_model_from_config(config, overrides=config_overrides)
 | 
				
			||||||
 | 
					    if config["training"]["base_model"]:
 | 
				
			||||||
 | 
					        base_nlp = util.load_model(config["training"]["base_model"])
 | 
				
			||||||
 | 
					        # TODO: do something to check base_nlp against regular nlp described in config?
 | 
				
			||||||
 | 
					        nlp = base_nlp
 | 
				
			||||||
 | 
					    verify_config(nlp)
 | 
				
			||||||
    raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
 | 
					    raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
 | 
				
			||||||
    if config["training"]["seed"] is not None:
 | 
					    if config["training"]["seed"] is not None:
 | 
				
			||||||
        fix_random_seed(config["training"]["seed"])
 | 
					        fix_random_seed(config["training"]["seed"])
 | 
				
			||||||
    if config["training"].get("use_pytorch_for_gpu_memory"):
 | 
					    if config["training"]["use_pytorch_for_gpu_memory"]:
 | 
				
			||||||
        # It feels kind of weird to not have a default for this.
 | 
					        # It feels kind of weird to not have a default for this.
 | 
				
			||||||
        use_pytorch_for_gpu_memory()
 | 
					        use_pytorch_for_gpu_memory()
 | 
				
			||||||
    nlp_config = config["nlp"]
 | 
					 | 
				
			||||||
    config = util.load_config(
 | 
					 | 
				
			||||||
        config_path,
 | 
					 | 
				
			||||||
        create_objects=True,
 | 
					 | 
				
			||||||
        schema=ConfigSchema,
 | 
					 | 
				
			||||||
        overrides=config_overrides,
 | 
					 | 
				
			||||||
    )
 | 
					 | 
				
			||||||
    training = config["training"]
 | 
					    training = config["training"]
 | 
				
			||||||
    msg.info("Creating nlp from config")
 | 
					 | 
				
			||||||
    nlp = util.load_model_from_config(nlp_config)
 | 
					 | 
				
			||||||
    optimizer = training["optimizer"]
 | 
					    optimizer = training["optimizer"]
 | 
				
			||||||
    limit = training["limit"]
 | 
					    limit = training["limit"]
 | 
				
			||||||
    corpus = Corpus(data_paths["train"], data_paths["dev"], limit=limit)
 | 
					    corpus = Corpus(data_paths["train"], data_paths["dev"], limit=limit)
 | 
				
			||||||
    if "textcat" in nlp_config["pipeline"]:
 | 
					    if resume_training:
 | 
				
			||||||
        verify_textcat_config(nlp, nlp_config)
 | 
					 | 
				
			||||||
    if training.get("resume", False):
 | 
					 | 
				
			||||||
        msg.info("Resuming training")
 | 
					        msg.info("Resuming training")
 | 
				
			||||||
        nlp.resume_training()
 | 
					        nlp.resume_training()
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
 | 
					        msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
 | 
				
			||||||
        train_examples = list(
 | 
					        train_examples = corpus.train_dataset(
 | 
				
			||||||
            corpus.train_dataset(
 | 
					 | 
				
			||||||
            nlp,
 | 
					            nlp,
 | 
				
			||||||
            shuffle=False,
 | 
					            shuffle=False,
 | 
				
			||||||
            gold_preproc=training["gold_preproc"],
 | 
					            gold_preproc=training["gold_preproc"],
 | 
				
			||||||
            max_length=training["max_length"],
 | 
					            max_length=training["max_length"],
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        )
 | 
					        train_examples = list(train_examples)
 | 
				
			||||||
        nlp.begin_training(lambda: train_examples)
 | 
					        nlp.begin_training(lambda: train_examples)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if tag_map:
 | 
				
			||||||
        # Replace tag map with provided mapping
 | 
					        # Replace tag map with provided mapping
 | 
				
			||||||
        nlp.vocab.morphology.load_tag_map(tag_map)
 | 
					        nlp.vocab.morphology.load_tag_map(tag_map)
 | 
				
			||||||
 | 
					    if morph_rules:
 | 
				
			||||||
        # Load morph rules
 | 
					        # Load morph rules
 | 
				
			||||||
        nlp.vocab.morphology.load_morph_exceptions(morph_rules)
 | 
					        nlp.vocab.morphology.load_morph_exceptions(morph_rules)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -151,9 +141,8 @@ def train(
 | 
				
			||||||
        for subpath in tok2vec_path.split("."):
 | 
					        for subpath in tok2vec_path.split("."):
 | 
				
			||||||
            tok2vec = tok2vec.get(subpath)
 | 
					            tok2vec = tok2vec.get(subpath)
 | 
				
			||||||
        if not tok2vec:
 | 
					        if not tok2vec:
 | 
				
			||||||
            msg.fail(
 | 
					            err = f"Could not locate the tok2vec model at {tok2vec_path}"
 | 
				
			||||||
                f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1,
 | 
					            msg.fail(err, exits=1)
 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        tok2vec.from_bytes(weights_data)
 | 
					        tok2vec.from_bytes(weights_data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    msg.info("Loading training corpus")
 | 
					    msg.info("Loading training corpus")
 | 
				
			||||||
| 
						 | 
					@ -169,12 +158,11 @@ def train(
 | 
				
			||||||
        evaluate,
 | 
					        evaluate,
 | 
				
			||||||
        dropout=training["dropout"],
 | 
					        dropout=training["dropout"],
 | 
				
			||||||
        accumulate_gradient=training["accumulate_gradient"],
 | 
					        accumulate_gradient=training["accumulate_gradient"],
 | 
				
			||||||
        patience=training.get("patience", 0),
 | 
					        patience=training["patience"],
 | 
				
			||||||
        max_steps=training.get("max_steps", 0),
 | 
					        max_steps=training["max_steps"],
 | 
				
			||||||
        eval_frequency=training["eval_frequency"],
 | 
					        eval_frequency=training["eval_frequency"],
 | 
				
			||||||
        raw_text=raw_text,
 | 
					        raw_text=raw_text,
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					 | 
				
			||||||
    msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
 | 
					    msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
 | 
				
			||||||
    print_row = setup_printer(training, nlp)
 | 
					    print_row = setup_printer(training, nlp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -209,8 +197,10 @@ def train(
 | 
				
			||||||
            msg.good(f"Saved model to output directory {final_model_path}")
 | 
					            msg.good(f"Saved model to output directory {final_model_path}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def create_train_batches(nlp, corpus, cfg):
 | 
					def create_train_batches(
 | 
				
			||||||
    max_epochs = cfg.get("max_epochs", 0)
 | 
					    nlp: Language, corpus: Corpus, cfg: Union[Config, Dict[str, Any]]
 | 
				
			||||||
 | 
					):
 | 
				
			||||||
 | 
					    max_epochs = cfg["max_epochs"]
 | 
				
			||||||
    train_examples = list(
 | 
					    train_examples = list(
 | 
				
			||||||
        corpus.train_dataset(
 | 
					        corpus.train_dataset(
 | 
				
			||||||
            nlp,
 | 
					            nlp,
 | 
				
			||||||
| 
						 | 
					@ -219,9 +209,8 @@ def create_train_batches(nlp, corpus, cfg):
 | 
				
			||||||
            max_length=cfg["max_length"],
 | 
					            max_length=cfg["max_length"],
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					 | 
				
			||||||
    epoch = 0
 | 
					    epoch = 0
 | 
				
			||||||
    batch_strategy = cfg.get("batch_by", "sequences")
 | 
					    batch_strategy = cfg["batch_by"]
 | 
				
			||||||
    while True:
 | 
					    while True:
 | 
				
			||||||
        if len(train_examples) == 0:
 | 
					        if len(train_examples) == 0:
 | 
				
			||||||
            raise ValueError(Errors.E988)
 | 
					            raise ValueError(Errors.E988)
 | 
				
			||||||
| 
						 | 
					@ -241,7 +230,6 @@ def create_train_batches(nlp, corpus, cfg):
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            batches = util.minibatch(train_examples, size=cfg["batch_size"])
 | 
					            batches = util.minibatch(train_examples, size=cfg["batch_size"])
 | 
				
			||||||
 | 
					 | 
				
			||||||
        # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
 | 
					        # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            first = next(batches)
 | 
					            first = next(batches)
 | 
				
			||||||
| 
						 | 
					@ -255,18 +243,20 @@ def create_train_batches(nlp, corpus, cfg):
 | 
				
			||||||
        random.shuffle(train_examples)
 | 
					        random.shuffle(train_examples)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def create_evaluation_callback(nlp, optimizer, corpus, cfg):
 | 
					def create_evaluation_callback(
 | 
				
			||||||
    def evaluate():
 | 
					    nlp: Language,
 | 
				
			||||||
        dev_examples = list(
 | 
					    optimizer: Optimizer,
 | 
				
			||||||
            corpus.dev_dataset(
 | 
					    corpus: Corpus,
 | 
				
			||||||
 | 
					    cfg: Union[Config, Dict[str, Any]],
 | 
				
			||||||
 | 
					) -> Callable[[], Tuple[float, Dict[str, float]]]:
 | 
				
			||||||
 | 
					    def evaluate() -> Tuple[float, Dict[str, float]]:
 | 
				
			||||||
 | 
					        dev_examples = corpus.dev_dataset(
 | 
				
			||||||
            nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
 | 
					            nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        )
 | 
					        dev_examples = list(dev_examples)
 | 
				
			||||||
 | 
					 | 
				
			||||||
        n_words = sum(len(ex.predicted) for ex in dev_examples)
 | 
					        n_words = sum(len(ex.predicted) for ex in dev_examples)
 | 
				
			||||||
        batch_size = cfg.get("evaluation_batch_size", 128)
 | 
					        batch_size = cfg["eval_batch_size"]
 | 
				
			||||||
        start_time = timer()
 | 
					        start_time = timer()
 | 
				
			||||||
 | 
					 | 
				
			||||||
        if optimizer.averages:
 | 
					        if optimizer.averages:
 | 
				
			||||||
            with nlp.use_params(optimizer.averages):
 | 
					            with nlp.use_params(optimizer.averages):
 | 
				
			||||||
                scorer = nlp.evaluate(dev_examples, batch_size=batch_size)
 | 
					                scorer = nlp.evaluate(dev_examples, batch_size=batch_size)
 | 
				
			||||||
| 
						 | 
					@ -280,12 +270,9 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
 | 
					            weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
 | 
				
			||||||
        except KeyError as e:
 | 
					        except KeyError as e:
 | 
				
			||||||
            raise KeyError(
 | 
					            keys = list(scores.keys())
 | 
				
			||||||
                Errors.E983.format(
 | 
					            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
 | 
				
			||||||
                    dict="score_weights", key=str(e), keys=list(scores.keys())
 | 
					            raise KeyError(err)
 | 
				
			||||||
                )
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        scores["speed"] = wps
 | 
					        scores["speed"] = wps
 | 
				
			||||||
        return weighted_score, scores
 | 
					        return weighted_score, scores
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -293,17 +280,17 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def train_while_improving(
 | 
					def train_while_improving(
 | 
				
			||||||
    nlp,
 | 
					    nlp: Language,
 | 
				
			||||||
    optimizer,
 | 
					    optimizer: Optimizer,
 | 
				
			||||||
    train_data,
 | 
					    train_data,
 | 
				
			||||||
    evaluate,
 | 
					    evaluate,
 | 
				
			||||||
    *,
 | 
					    *,
 | 
				
			||||||
    dropout,
 | 
					    dropout: float,
 | 
				
			||||||
    eval_frequency,
 | 
					    eval_frequency: int,
 | 
				
			||||||
    accumulate_gradient=1,
 | 
					    accumulate_gradient: int,
 | 
				
			||||||
    patience=0,
 | 
					    patience: int,
 | 
				
			||||||
    max_steps=0,
 | 
					    max_steps: int,
 | 
				
			||||||
    raw_text=None,
 | 
					    raw_text: List[Dict[str, str]],
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """Train until an evaluation stops improving. Works as a generator,
 | 
					    """Train until an evaluation stops improving. Works as a generator,
 | 
				
			||||||
    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
 | 
					    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
 | 
				
			||||||
| 
						 | 
					@ -414,7 +401,9 @@ def subdivide_batch(batch, accumulate_gradient):
 | 
				
			||||||
        yield subbatch
 | 
					        yield subbatch
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def setup_printer(training, nlp):
 | 
					def setup_printer(
 | 
				
			||||||
 | 
					    training: Union[Dict[str, Any], Config], nlp: Language
 | 
				
			||||||
 | 
					) -> Callable[[Dict[str, Any]], None]:
 | 
				
			||||||
    score_cols = training["scores"]
 | 
					    score_cols = training["scores"]
 | 
				
			||||||
    score_widths = [max(len(col), 6) for col in score_cols]
 | 
					    score_widths = [max(len(col), 6) for col in score_cols]
 | 
				
			||||||
    loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
 | 
					    loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
 | 
				
			||||||
| 
						 | 
					@ -423,11 +412,10 @@ def setup_printer(training, nlp):
 | 
				
			||||||
    table_header = [col.upper() for col in table_header]
 | 
					    table_header = [col.upper() for col in table_header]
 | 
				
			||||||
    table_widths = [3, 6] + loss_widths + score_widths + [6]
 | 
					    table_widths = [3, 6] + loss_widths + score_widths + [6]
 | 
				
			||||||
    table_aligns = ["r" for _ in table_widths]
 | 
					    table_aligns = ["r" for _ in table_widths]
 | 
				
			||||||
 | 
					 | 
				
			||||||
    msg.row(table_header, widths=table_widths)
 | 
					    msg.row(table_header, widths=table_widths)
 | 
				
			||||||
    msg.row(["-" * width for width in table_widths])
 | 
					    msg.row(["-" * width for width in table_widths])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def print_row(info):
 | 
					    def print_row(info: Dict[str, Any]) -> None:
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            losses = [
 | 
					            losses = [
 | 
				
			||||||
                "{0:.2f}".format(float(info["losses"][pipe_name]))
 | 
					                "{0:.2f}".format(float(info["losses"][pipe_name]))
 | 
				
			||||||
| 
						 | 
					@ -463,7 +451,9 @@ def setup_printer(training, nlp):
 | 
				
			||||||
    return print_row
 | 
					    return print_row
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def update_meta(training, nlp, info):
 | 
					def update_meta(
 | 
				
			||||||
 | 
					    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
 | 
				
			||||||
 | 
					) -> None:
 | 
				
			||||||
    score_cols = training["scores"]
 | 
					    score_cols = training["scores"]
 | 
				
			||||||
    nlp.meta["performance"] = {}
 | 
					    nlp.meta["performance"] = {}
 | 
				
			||||||
    for metric in score_cols:
 | 
					    for metric in score_cols:
 | 
				
			||||||
| 
						 | 
					@ -472,7 +462,9 @@ def update_meta(training, nlp, info):
 | 
				
			||||||
        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
 | 
					        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def load_from_paths(config):
 | 
					def load_from_paths(
 | 
				
			||||||
 | 
					    config: Config,
 | 
				
			||||||
 | 
					) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
 | 
				
			||||||
    # TODO: separate checks from loading
 | 
					    # TODO: separate checks from loading
 | 
				
			||||||
    raw_text = util.ensure_path(config["training"]["raw_text"])
 | 
					    raw_text = util.ensure_path(config["training"]["raw_text"])
 | 
				
			||||||
    if raw_text is not None:
 | 
					    if raw_text is not None:
 | 
				
			||||||
| 
						 | 
					@ -506,7 +498,7 @@ def verify_cli_args(
 | 
				
			||||||
    dev_path: Path,
 | 
					    dev_path: Path,
 | 
				
			||||||
    config_path: Path,
 | 
					    config_path: Path,
 | 
				
			||||||
    output_path: Optional[Path] = None,
 | 
					    output_path: Optional[Path] = None,
 | 
				
			||||||
):
 | 
					) -> None:
 | 
				
			||||||
    # Make sure all files and paths exists if they are needed
 | 
					    # Make sure all files and paths exists if they are needed
 | 
				
			||||||
    if not config_path or not config_path.exists():
 | 
					    if not config_path or not config_path.exists():
 | 
				
			||||||
        msg.fail("Config file not found", config_path, exits=1)
 | 
					        msg.fail("Config file not found", config_path, exits=1)
 | 
				
			||||||
| 
						 | 
					@ -528,12 +520,23 @@ def verify_cli_args(
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def verify_textcat_config(nlp, nlp_config):
 | 
					def verify_config(nlp: Language) -> None:
 | 
				
			||||||
 | 
					    """Perform additional checks based on the config and loaded nlp object."""
 | 
				
			||||||
 | 
					    # TODO: maybe we should validate based on the actual components, the list
 | 
				
			||||||
 | 
					    # in config["nlp"]["pipeline"] instead?
 | 
				
			||||||
 | 
					    for pipe_config in nlp.config["components"].values():
 | 
				
			||||||
 | 
					        # We can't assume that the component name == the factory
 | 
				
			||||||
 | 
					        factory = pipe_config["@factories"]
 | 
				
			||||||
 | 
					        if factory == "textcat":
 | 
				
			||||||
 | 
					            verify_textcat_config(nlp, pipe_config)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
 | 
				
			||||||
    # if 'positive_label' is provided: double check whether it's in the data and
 | 
					    # if 'positive_label' is provided: double check whether it's in the data and
 | 
				
			||||||
    # the task is binary
 | 
					    # the task is binary
 | 
				
			||||||
    if nlp_config["pipeline"]["textcat"].get("positive_label", None):
 | 
					    if pipe_config.get("positive_label"):
 | 
				
			||||||
        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
 | 
					        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
 | 
				
			||||||
        pos_label = nlp_config["pipeline"]["textcat"]["positive_label"]
 | 
					        pos_label = pipe_config.get("positive_label")
 | 
				
			||||||
        if pos_label not in textcat_labels:
 | 
					        if pos_label not in textcat_labels:
 | 
				
			||||||
            msg.fail(
 | 
					            msg.fail(
 | 
				
			||||||
                f"The textcat's 'positive_label' config setting '{pos_label}' "
 | 
					                f"The textcat's 'positive_label' config setting '{pos_label}' "
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
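To make the new verify_config() above easier to follow, here is a minimal sketch (not part of the diff) of the kind of [components] block it iterates over: each component block names its factory via the @factories key (see E984 in spacy/errors.py further down), and verify_textcat_config() reads per-component settings such as positive_label from that same block. The contents of the "textcat" block and the "POSITIVE" label are made up for illustration.

# Hypothetical config excerpt, parsed with Thinc's Config class the same way
# train() above parses user configs. Keys starting with "@" are kept as plain
# strings at this point; they are only resolved against the registry when the
# nlp object is actually created.
from thinc.api import Config

cfg = Config().from_str("""
[components]

[components.textcat]
@factories = "textcat"
positive_label = "POSITIVE"
""")

for name, pipe_config in cfg["components"].items():
    print(name, pipe_config["@factories"], pipe_config.get("positive_label"))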
							
								
								
									
spacy/default_config.cfg | 102 (new file)

@@ -0,0 +1,102 @@
+[nlp]
+lang = null
+stop_words = []
+lex_attr_getters = {}
+pipeline = []
+
+[nlp.tokenizer]
+@tokenizers = "spacy.Tokenizer.v1"
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.writing_system]
+direction = "ltr"
+has_case = true
+has_letters = true
+
+[components]
+
+# Training hyper-parameters and additional features.
+[training]
+# Whether to train on sequences with 'gold standard' sentence boundaries
+# and tokens. If you set this to true, take care to ensure your run-time
+# data is passed in sentence-by-sentence via some prior preprocessing.
+gold_preproc = false
+# Limitations on training document length or number of examples.
+max_length = 5000
+limit = 0
+# Data augmentation
+orth_variant_level = 0.0
+dropout = 0.1
+# Controls early-stopping. 0 or -1 mean unlimited.
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 200
+eval_batch_size = 128
+# Other settings
+seed = 0
+accumulate_gradient = 1
+use_pytorch_for_gpu_memory = false
+# Control how scores are printed and checkpoints are evaluated.
+scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
+score_weights = {"tags_acc": 0.2, "las": 0.4, "ents_f": 0.4}
+# These settings are invalid for the transformer models.
+init_tok2vec = null
+discard_oversize = false
+omit_extra_lookups = false
+batch_by = "sequences"
+raw_text = null
+tag_map = null
+morph_rules = null
+base_model = null
+vectors = null
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 1000
+stop = 1000
+compound = 1.001
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 1e-8
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 0.001
+
+[pretraining]
+max_epochs = 1000
+min_length = 5
+max_length = 500
+dropout = 0.2
+n_save_every = null
+batch_size = 3000
+seed = ${training:seed}
+use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
+tok2vec_model = "components.tok2vec.model"
+
+[pretraining.objective]
+type = "characters"
+n_characters = 4
+
+[pretraining.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = true
+eps = 1e-8
+learn_rate = 0.001
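As a quick usage note (not from the diff): this new default config can be loaded and inspected with Thinc's Config class, the same class train() uses above for user configs. A small sketch follows; the relative path assumes you are running from the repository root, and the interpolated value assumes references like ${training:seed} are resolved against the [training] block at load time.

# Config behaves like a nested dict once parsed.
from thinc.api import Config

config = Config().from_disk("spacy/default_config.cfg")
print(config["training"]["dropout"])    # 0.1
print(config["training"]["batch_by"])   # sequences
print(config["pretraining"]["seed"])    # 0, via ${training:seed}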
							
								
								
									
spacy/errors.py | 108

@@ -124,20 +124,24 @@ class Warnings:
 @add_codes
 class Errors:
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
-    E002 = ("Can't find factory for '{name}'. This usually happens when spaCy "
-            "calls `nlp.create_pipe` with a component name that's not built "
-            "in - for example, when constructing the pipeline from a model's "
-            "meta.json. If you're using a custom component, you can write to "
-            "`Language.factories['{name}']` or remove it from the model meta "
-            "and add it via `nlp.add_pipe` instead.")
+    E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
+            "This usually happens when spaCy calls nlp.{method} with custom "
+            "component name that's not registered on the current language class. "
+            "If you're using a custom component, make sure you've added the "
+            "decorator @Language.component (for function components) or "
+            "@Language.factory (for class components).\n\nAvailable "
+            "factories: {opts}")
     E003 = ("Not a valid pipeline component. Expected callable, but "
-            "got {component} (name: '{name}').")
-    E004 = ("If you meant to add a built-in component, use `create_pipe`: "
-            "`nlp.add_pipe(nlp.create_pipe('{component}'))`")
+            "got {component} (name: '{name}'). If you're using a custom "
+            "component factory, double-check that it correctly returns your "
+            "initialized component.")
+    E004 = ("Can't set up pipeline component: a factory for '{name}' already exists.")
     E005 = ("Pipeline component '{name}' returned None. If you're using a "
             "custom component, maybe you forgot to return the processed Doc?")
-    E006 = ("Invalid constraints. You can only set one of the following: "
-            "before, after, first, last.")
+    E006 = ("Invalid constraints for adding pipeline component. You can only "
+            "set one of the following: before (component name or index), "
+            "after (component name or index), first (True) or last (True). "
+            "Invalid configuration: {args}. Existing components: {opts}")
     E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
     E008 = ("Some current components would be lost when restoring previous "
             "pipeline state. If you added components after calling "

@@ -184,7 +188,7 @@ class Errors:
             "the documentation:\nhttps://spacy.io/usage/models")
     E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
             "component to the pipeline with: "
-            "nlp.add_pipe(nlp.create_pipe('sentencizer')). "
+            "nlp.add_pipe('sentencizer'). "
             "Alternatively, add the dependency parser, or set sentence "
             "boundaries by setting doc[i].is_sent_start.")
     E031 = ("Invalid token: empty string ('') at position {i}.")

@@ -365,8 +369,6 @@ class Errors:
     E133 = ("The sum of prior probabilities for alias '{alias}' should not "
             "exceed 1, but found {sum}.")
     E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
-    E135 = ("If you meant to replace a built-in component, use `create_pipe`: "
-            "`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
     E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
             "to provide a valid JSON object as input with either the `text` "
             "or `tokens` key. For more info, see the docs:\n"

@@ -484,6 +486,62 @@ class Errors:
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
 
     # TODO: fix numbering after merging develop into master
+    E956 = ("Can't find component '{name}' in [components] block in the config. "
+            "Available components: {opts}")
+    E957 = ("Writing directly to Language.factories isn't needed anymore in "
+            "spaCy v3. Instead, you can use the @Language.factory decorator "
+            "to register your custom component factory or @Language.component "
+            "to register a simple stateless function component that just takes "
+            "a Doc and returns it.")
+    E958 = ("Language code defined in config ({bad_lang_code}) does not match "
+            "language code of current Language subclass {lang} ({lang_code})")
+    E959 = ("Can't insert component {dir} index {idx}. Existing components: {opts}")
+    E960 = ("No config data found for component '{name}'. This is likely a bug "
+            "in spaCy.")
+    E961 = ("Found non-serializable Python object in config. Configs should "
+            "only include values that can be serialized to JSON. If you need "
+            "to pass models or other objects to your component, use a reference "
+            "to a registered function or initialize the object in your "
+            "component.\n\n{config}")
+    E962 = ("Received incorrect {style} for pipe '{name}'. Expected dict, "
+            "got: {cfg_type}.")
+    E963 = ("Can't read component info from @Language.{decorator} decorator. "
+            "Maybe you forgot to call it? Make sure you're using "
+            "@Language.{decorator}() instead of @Language.{decorator}.")
+    E964 = ("The pipeline component factory for '{name}' needs to have the "
+            "following named arguments, which are passed in by spaCy:\n- nlp: "
+            "receives the current nlp object and lets you access the vocab\n- "
+            "name: the name of the component instance, can be used to identify "
+            "the component, output losses etc.")
+    E965 = ("It looks like you're using the @Language.component decorator to "
+            "register '{name}' on a class instead of a function component. If "
+            "you need to register a class or function that *returns* a component "
+            "function, use the @Language.factory decorator instead.")
+    E966 = ("nlp.add_pipe now takes the string name of the registered component "
+            "factory, not a callable component. Expected string, but got "
+            "{component} (name: '{name}').\n\n- If you created your component "
+            "with nlp.create_pipe('name'): remove nlp.create_pipe and call "
+            "nlp.add_pipe('name') instead.\n\n- If you passed in a component "
+            "like TextCategorizer(): call nlp.add_pipe with the string name "
+            "instead, e.g. nlp.add_pipe('textcat').\n\n- If you're using a custom "
+            "component: Add the decorator @Language.component (for function "
+            "components) or @Language.factory (for class components / factories) "
+            "to your custom component and assign it a name, e.g. "
+            "@Language.component('your_name'). You can then run "
+            "nlp.add_pipe('your_name') to add it to the pipeline.")
+    E967 = ("No {meta} meta information found for '{name}'. This is likely a bug in spaCy.")
+    E968 = ("nlp.replace_pipe now takes the string name of the registered component "
+            "factory, not a callable component. Expected string, but got "
+            "{component}.\n\n- If you created your component with"
+            "with nlp.create_pipe('name'): remove nlp.create_pipe and call "
+            "nlp.replace_pipe('{name}', 'name') instead.\n\n- If you passed in a "
+            "component like TextCategorizer(): call nlp.replace_pipe with the "
+            "string name instead, e.g. nlp.replace_pipe('{name}', 'textcat').\n\n"
+            "- If you're using a custom component: Add the decorator "
+            "@Language.component (for function components) or @Language.factory "
+            "(for class components / factories) to your custom component and "
+            "assign it a name, e.g. @Language.component('your_name'). You can "
+            "then run nlp.replace_pipe('{name}', 'your_name').")
     E969 = ("Expected string values for field '{field}', but received {types} instead. ")
     E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
     E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "

@@ -506,10 +564,12 @@ class Errors:
             "into {values}, but found {value}.")
     E983 = ("Invalid key for '{dict}': {key}. Available keys: "
             "{keys}")
-    E985 = ("The pipeline component '{component}' is already available in the base "
-            "model. The settings in the component block in the config file are "
-            "being ignored. If you want to replace this component instead, set "
-            "'replace' to True in the training configuration.")
+    E984 = ("Invalid component config for '{name}': no @factories key "
+            "specifying the registered function used to initialize the "
+            "component. For example, @factories = \"ner\" will use the 'ner' "
+            "factory and all other settings in the block will be passed "
+            "to it as arguments.\n\n{config}")
+    E985 = ("Can't load model from config file: no 'nlp' section found.\n\n{config}")
     E986 = ("Could not create any training batches: check your input. "
             "Perhaps discard_oversize should be set to False ?")
     E987 = ("The text of an example training instance is either a Doc or "

@@ -530,9 +590,9 @@ class Errors:
     E992 = ("The function `select_pipes` was called with `enable`={enable} "
             "and `disable`={disable} but that information is conflicting "
             "for the `nlp` pipeline with components {names}.")
-    E993 = ("The config for 'nlp' should include either a key 'name' to "
-            "refer to an existing model by name or path, or a key 'lang' "
-            "to create a new blank model.")
+    E993 = ("The config for 'nlp' needs to include a key 'lang' specifying "
+            "the code of the language to initialize it with (for example "
+            "'en' for English).\n\n{config}")
     E996 = ("Could not parse {file}: {msg}")
     E997 = ("Tokenizer special cases are not allowed to modify the text. "
             "This would map '{chunk}' to '{orth}' given token attributes "

@@ -540,9 +600,9 @@ class Errors:
     E999 = ("Unable to merge the `Doc` objects because they do not all share "
             "the same `Vocab`.")
     E1000 = ("No pkuseg model available. Provide a pkuseg model when "
-            "initializing the pipeline: "
-            '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
-            'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`')
+             "initializing the pipeline:\n"
+             'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n'
+             'nlp = Chinese(config=cfg)')
 
 
 @add_codes
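The new E957, E965 and E966 messages above all describe the same registration pattern for custom components. A minimal sketch of that pattern, based purely on the wording of those error messages (the component name "my_component" is made up):

import spacy
from spacy.language import Language

@Language.component("my_component")  # register a stateless function component
def my_component(doc):
    # do something with the Doc and return it (otherwise E005 is raised)
    return doc

nlp = spacy.blank("en")
nlp.add_pipe("my_component")  # add by string name, as E966 describes
print(nlp.pipe_names)         # ['my_component']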
@@ -1,10 +1,9 @@
 import re
 
 from .conll_ner2docs import n_sents_info
-from ...gold import Example
 from ...gold import iob_to_biluo, spans_from_biluo_tags
-from ...language import Language
 from ...tokens import Doc, Token, Span
+from ...vocab import Vocab
 from wasabi import Printer
 
 

@@ -73,7 +72,7 @@ def read_conllx(
     ner_map=None,
 ):
     """ Yield docs, one for each sentence """
-    vocab = Language.Defaults.create_vocab()  # need vocab to make a minimal Doc
+    vocab = Vocab()  # need vocab to make a minimal Doc
     for sent in input_data.strip().split("\n\n"):
         lines = sent.strip().split("\n")
         if lines:
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class AfrikaansDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "af"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "af"
+stop_words = {"@language_data": "spacy.af.stop_words"}
+"""
+
+
+@registry.language_data("spacy.af.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Afrikaans(Language):
     lang = "af"
-    Defaults = AfrikaansDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Afrikaans"]
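A short sketch (not part of the diff) of how a {"@language_data": "spacy.af.stop_words"} reference like the one above can be resolved by hand, assuming the new language_data table behaves like spaCy's other catalogue-based registries and that importing the language module is what registers the function:

from spacy.lang.af import Afrikaans  # noqa: F401 - importing registers "spacy.af.stop_words"
from spacy.util import registry

stop_words_func = registry.language_data.get("spacy.af.stop_words")
print(len(stop_words_func()), "Afrikaans stop words")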
@@ -1,31 +1,48 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ar"
+stop_words = {"@language_data": "spacy.ar.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.ar.lex_attr_getters"}
+
+[nlp.writing_system]
+direction = "rtl"
+has_case = false
+has_letters = true
+"""
+
+
+@registry.language_data("spacy.ar.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.ar.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class ArabicDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "ar"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     suffixes = TOKENIZER_SUFFIXES
-    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
 
 
 class Arabic(Language):
     lang = "ar"
     Defaults = ArabicDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Arabic"]
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class BulgarianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "bg"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "bg"
+stop_words = {"@language_data": "spacy.bg.stop_words"}
+"""
+
+
+@registry.language_data("spacy.bg.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Bulgarian(Language):
     lang = "bg"
-    Defaults = BulgarianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Bulgarian"]
@@ -1,18 +1,35 @@
+from typing import Set
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "bn"
+stop_words = {"@language_data": "spacy.bn.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.bn.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class BengaliDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "bn"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES

@@ -21,6 +38,7 @@ class BengaliDefaults(Language.Defaults):
 class Bengali(Language):
     lang = "bn"
     Defaults = BengaliDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Bengali"]
| 
						 | 
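The [nlp.lemmatizer.data_paths] block reuses the language code through config interpolation, so the lookups package is queried for the right language without repeating the value. A hedged sketch of the ${nlp:lang} syntax used above (registry references omitted; assumes thinc's default interpolation on from_str):

    # Sketch only: ${section:key} copies a value from another section at parse time.
    from thinc.api import Config

    cfg_text = """
    [nlp]
    lang = "bn"

    [nlp.lemmatizer]

    [nlp.lemmatizer.data_paths]
    lang = ${nlp:lang}
    """

    config = Config().from_str(cfg_text)  # interpolates by default
    print(config["nlp"]["lemmatizer"]["data_paths"]["lang"])  # "bn"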
spacy/lang/ca/__init__.py
@@ -1,31 +1,49 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry

 from .punctuation import TOKENIZER_INFIXES
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ca"
+stop_words = {"@language_data": "spacy.ca.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.ca.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.ca.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class CatalanDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "ca"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     infixes = TOKENIZER_INFIXES


 class Catalan(Language):
     lang = "ca"
     Defaults = CatalanDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Catalan"]
spacy/lang/cs/__init__.py
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "cs"
+stop_words = {"@language_data": "spacy.cs.stop_words"}
+"""


-class CzechDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "cs"
-    stop_words = STOP_WORDS
+@registry.language_data("spacy.cs.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


 class Czech(Language):
     lang = "cs"
-    Defaults = CzechDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Czech"]
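The registered stop-word functions are plain entries in spaCy's function registry, so a config reference such as "spacy.cs.stop_words" can be resolved back into the function by name. A hedged sketch of the round trip (the registry name below is hypothetical, chosen only for illustration; assumes catalogue's standard get()):

    # Sketch only: register and look up a language-data function by name.
    from spacy.util import registry

    @registry.language_data("example.cs.stop_words")  # hypothetical entry name
    def example_stop_words():
        return {"a", "aby", "ahoj"}

    func = registry.language_data.get("example.cs.stop_words")
    print(func())  # {"a", "aby", "ahoj"}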
spacy/lang/da/__init__.py
@@ -1,27 +1,50 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "da"
+stop_words = {"@language_data": "spacy.da.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.da.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.da.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class DanishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "da"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    stop_words = STOP_WORDS


 class Danish(Language):
     lang = "da"
     Defaults = DanishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Danish"]
spacy/lang/de/__init__.py
@@ -1,23 +1,40 @@
+from typing import Set
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "de"
+stop_words = {"@language_data": "spacy.de.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.de.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


 class GermanDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "de"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
-    stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
     single_orth_variants = [
         {"tags": ["$("], "variants": ["…", "..."]},
@@ -38,6 +55,7 @@ class GermanDefaults(Language.Defaults):
 class German(Language):
     lang = "de"
     Defaults = GermanDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["German"]
spacy/lang/el/__init__.py
@@ -1,3 +1,6 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -6,32 +9,51 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...lookups import Lookups
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "el"
+stop_words = {"@language_data": "spacy.el.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.GreekLemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.lemmatizers("spacy.GreekLemmatizer.v1")
+def create_greek_lemmatizer(data_paths: dict = {}) -> GreekLemmatizer:
+    return GreekLemmatizer(data_paths=data_paths)
+
+
+@registry.language_data("spacy.el.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.el.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class GreekDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "el"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
     syntax_iterators = SYNTAX_ITERATORS

-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return GreekLemmatizer(lookups)
-

 class Greek(Language):
     lang = "el"
     Defaults = GreekDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Greek"]
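The @lemmatizers = "spacy.GreekLemmatizer.v1" line in the config is what ties the [nlp.lemmatizer] block to the factory registered above: when the nlp object is built from the config, the name is looked up in the registry and the factory is called with the resolved data_paths argument. A hedged sketch of that resolution (illustrative only; it assumes the refactored Lemmatizer signature from this PR, and whether any tables actually load depends on spacy-lookups-data being installed):

    # Sketch only: resolve the registered lemmatizer factory by name.
    import spacy.lang.el  # noqa: F401 - importing the module runs the @registry decorators
    from spacy.util import registry

    factory = registry.lemmatizers.get("spacy.GreekLemmatizer.v1")
    lemmatizer = factory(data_paths={})  # the config's data_paths block fills this keyword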
spacy/lang/el/lemmatizer.py
@@ -1,3 +1,5 @@
+from typing import Dict, List
+
 from ...lemmatizer import Lemmatizer


@@ -12,7 +14,13 @@ class GreekLemmatizer(Lemmatizer):
     not applicable for Greek language.
     """

-    def lemmatize(self, string, index, exceptions, rules):
+    def lemmatize(
+        self,
+        string: str,
+        index: Dict[str, List[str]],
+        exceptions: Dict[str, Dict[str, List[str]]],
+        rules: Dict[str, List[List[str]]],
+    ) -> List[str]:
         string = string.lower()
         forms = []
         if string in index:
spacy/lang/en/__init__.py
@@ -1,25 +1,50 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
+from .lemmatizer import is_base_form
 from .punctuation import TOKENIZER_INFIXES
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...lemmatizer import Lemmatizer
+from ...util import update_exc, registry


-def _return_en(_):
-    return "en"
+DEFAULT_CONFIG = """
+[nlp]
+lang = "en"
+stop_words = {"@language_data": "spacy.en.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.EnglishLemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.en.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.en.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
+@registry.lemmatizers("spacy.EnglishLemmatizer.v1")
+def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
+    return Lemmatizer(data_paths=data_paths, is_base_form=is_base_form)


 class EnglishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = _return_en
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
     infixes = TOKENIZER_INFIXES
     single_orth_variants = [
@@ -31,45 +56,11 @@ class EnglishDefaults(Language.Defaults):
         {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
     ]

-    @classmethod
-    def is_base_form(cls, univ_pos, morphology=None):
-        """
-        Check whether we're dealing with an uninflected paradigm, so we can
-        avoid lemmatization entirely.
-
-        univ_pos (unicode / int): The token's universal part-of-speech tag.
-        morphology (dict): The token's morphological features following the
-            Universal Dependencies scheme.
-        """
-        if morphology is None:
-            morphology = {}
-        if univ_pos == "noun" and morphology.get("Number") == "sing":
-            return True
-        elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
-            return True
-        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
-        # morphology
-        elif univ_pos == "verb" and (
-            morphology.get("VerbForm") == "fin"
-            and morphology.get("Tense") == "pres"
-            and morphology.get("Number") is None
-        ):
-            return True
-        elif univ_pos == "adj" and morphology.get("Degree") == "pos":
-            return True
-        elif morphology.get("VerbForm") == "inf":
-            return True
-        elif morphology.get("VerbForm") == "none":
-            return True
-        elif morphology.get("Degree") == "pos":
-            return True
-        else:
-            return False
-

 class English(Language):
     lang = "en"
     Defaults = EnglishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["English"]
spacy/lang/en/lemmatizer.py (new file, 36 lines)
@@ -0,0 +1,36 @@
+from typing import Optional
+
+
+def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
+    """
+    Check whether we're dealing with an uninflected paradigm, so we can
+    avoid lemmatization entirely.
+
+    univ_pos (unicode / int): The token's universal part-of-speech tag.
+    morphology (dict): The token's morphological features following the
+        Universal Dependencies scheme.
+    """
+    if morphology is None:
+        morphology = {}
+    if univ_pos == "noun" and morphology.get("Number") == "sing":
+        return True
+    elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
+        return True
+    # This maps 'VBP' to base form -- probably just need 'IS_BASE'
+    # morphology
+    elif univ_pos == "verb" and (
+        morphology.get("VerbForm") == "fin"
+        and morphology.get("Tense") == "pres"
+        and morphology.get("Number") is None
+    ):
+        return True
+    elif univ_pos == "adj" and morphology.get("Degree") == "pos":
+        return True
+    elif morphology.get("VerbForm") == "inf":
+        return True
+    elif morphology.get("VerbForm") == "none":
+        return True
+    elif morphology.get("Degree") == "pos":
+        return True
+    else:
+        return False
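Moving is_base_form out of EnglishDefaults makes it a plain module-level function that can be passed into the shared Lemmatizer as a callback. A quick illustrative check of its behaviour against this branch (values chosen to hit the branches shown above):

    # Sketch only: exercising the is_base_form branches above.
    from spacy.lang.en.lemmatizer import is_base_form

    print(is_base_form("noun", {"Number": "sing"}))   # True: singular noun
    print(is_base_form("verb", {"VerbForm": "inf"}))  # True: infinitive
    print(is_base_form("adj", {"Degree": "cmp"}))     # False: comparative adjective
    print(is_base_form("noun"))                       # False: no morphology given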
spacy/lang/en/lex_attrs.py
@@ -1,47 +1,17 @@
 from ...attrs import LIKE_NUM

+# fmt: off
 _num_words = [
-    "zero",
-    "one",
-    "two",
-    "three",
-    "four",
-    "five",
-    "six",
-    "seven",
-    "eight",
-    "nine",
-    "ten",
-    "eleven",
-    "twelve",
-    "thirteen",
-    "fourteen",
-    "fifteen",
-    "sixteen",
-    "seventeen",
-    "eighteen",
-    "nineteen",
-    "twenty",
-    "thirty",
-    "forty",
-    "fifty",
-    "sixty",
-    "seventy",
-    "eighty",
-    "ninety",
-    "hundred",
-    "thousand",
-    "million",
-    "billion",
-    "trillion",
-    "quadrillion",
-    "gajillion",
-    "bazillion",
+    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
+    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
+    "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty",
+    "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
+    "million", "billion", "trillion", "quadrillion", "gajillion", "bazillion"
 ]
+# fmt: on


-def like_num(text):
+def like_num(text: str) -> bool:
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
     text = text.replace(",", "").replace(".", "")
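like_num itself only gains annotations here; the word list is just reflowed. For context, a hedged sketch of how such a getter is exposed to the pipeline via the module's LEX_ATTRS mapping (the mapping shape follows the usual spaCy lex_attrs pattern and is assumed rather than shown in this hunk, and the expected outputs assume the unchanged tail of like_num):

    # Sketch only: LEX_ATTRS maps attribute IDs to getter functions such as like_num.
    from spacy.attrs import LIKE_NUM
    from spacy.lang.en.lex_attrs import like_num

    LEX_ATTRS_EXAMPLE = {LIKE_NUM: like_num}  # assumed shape of the module's LEX_ATTRS

    print(like_num("ten"))     # True: listed in _num_words
    print(like_num("10,000"))  # True: all digits once separators are stripped
    print(like_num("tenth"))   # False: not in the word list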
spacy/lang/es/__init__.py
@@ -1,33 +1,52 @@
+from typing import Set, Dict, Callable, Any
+from thinc.config import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "es"
+stop_words = {"@language_data": "spacy.es.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.es.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.es.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class SpanishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "es"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS


 class Spanish(Language):
     lang = "es"
     Defaults = SpanishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Spanish"]
spacy/lang/et/__init__.py
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "et"
+stop_words = {"@language_data": "spacy.et.stop_words"}
+"""


-class EstonianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "et"
-    stop_words = STOP_WORDS
+@registry.language_data("spacy.et.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


 class Estonian(Language):
     lang = "et"
-    Defaults = EstonianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Estonian"]
spacy/lang/eu/__init__.py
@@ -1,25 +1,41 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "eu"
+stop_words = {"@language_data": "spacy.eu.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.eu.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.eu.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.eu.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class BasqueDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "eu"
-
     tokenizer_exceptions = BASE_EXCEPTIONS
-    stop_words = STOP_WORDS
     suffixes = TOKENIZER_SUFFIXES


 class Basque(Language):
     lang = "eu"
     Defaults = BasqueDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Basque"]
spacy/lang/fa/__init__.py
@@ -1,7 +1,8 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
-from ..norm_exceptions import BASE_NORMS
+from ...util import update_exc, registry
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -9,23 +10,46 @@ from .punctuation import TOKENIZER_SUFFIXES
 from .syntax_iterators import SYNTAX_ITERATORS
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "fa"
+stop_words = {"@language_data": "spacy.fa.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"}
+
+[nlp.writing_system]
+direction = "rtl"
+has_case = false
+has_letters = true
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.fa.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.fa.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class PersianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    lex_attr_getters[LANG] = lambda text: "fa"
     tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     suffixes = TOKENIZER_SUFFIXES
-    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
     syntax_iterators = SYNTAX_ITERATORS


 class Persian(Language):
     lang = "fa"
     Defaults = PersianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Persian"]
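Persian is the first language in this batch whose defaults carry a writing_system, so the dict that used to live on PersianDefaults now becomes an [nlp.writing_system] config block. Once parsed, the lowercase config booleans come back as Python booleans. A minimal sketch, not the actual loading code (assumes thinc's JSON-style value parsing):

    # Sketch only: reading the writing-system block from a parsed config.
    from thinc.api import Config

    cfg = Config().from_str("""
    [nlp]
    lang = "fa"

    [nlp.writing_system]
    direction = "rtl"
    has_case = false
    has_letters = true
    """)
    ws = cfg["nlp"]["writing_system"]
    print(ws["direction"], ws["has_case"], ws["has_letters"])  # rtl False True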
spacy/lang/fi/__init__.py
@@ -1,31 +1,43 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "fi"
+stop_words = {"@language_data": "spacy.fi.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.fi.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.fi.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.fi.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class FinnishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "fi"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS


 class Finnish(Language):
     lang = "fi"
     Defaults = FinnishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Finnish"]
spacy/lang/fr/__init__.py
@@ -1,44 +1,61 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from .lemmatizer import FrenchLemmatizer
+from .lemmatizer import FrenchLemmatizer, is_base_form
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lookups import Lookups
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "fr"
+stop_words = {"@language_data": "spacy.fr.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.FrenchLemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.lemmatizers("spacy.FrenchLemmatizer.v1")
+def create_french_lemmatizer(data_paths: dict = {}) -> FrenchLemmatizer:
+    return FrenchLemmatizer(data_paths=data_paths, is_base_form=is_base_form)
+
+
+@registry.language_data("spacy.fr.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.fr.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class FrenchDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "fr"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     token_match = TOKEN_MATCH
     syntax_iterators = SYNTAX_ITERATORS

-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return FrenchLemmatizer(lookups)
-

 class French(Language):
     lang = "fr"
     Defaults = FrenchDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["French"]
@@ -1,3 +1,5 @@
+from typing import Optional, List, Dict
+
 from ...lemmatizer import Lemmatizer
 from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
 from ...symbols import SCONJ, CCONJ
@@ -13,7 +15,9 @@ class FrenchLemmatizer(Lemmatizer):
     the lookup table.
     """
 
-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
         lookup_table = self.lookups.get_table("lemma_lookup", {})
         if "lemma_rules" not in self.lookups:
             return [lookup_table.get(string, string)]
@@ -52,7 +56,47 @@ class FrenchLemmatizer(Lemmatizer):
         )
         return lemmas
 
-    def is_base_form(self, univ_pos, morphology=None):
+    def lookup(self, string: str, orth: Optional[int] = None) -> str:
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        if orth is not None and orth in lookup_table:
+            return lookup_table[orth][0]
+        return string
+
+    def lemmatize(
+        self,
+        string: str,
+        index: Dict[str, List[str]],
+        exceptions: Dict[str, Dict[str, List[str]]],
+        rules: Dict[str, List[List[str]]],
+    ) -> List[str]:
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        string = string.lower()
+        forms = []
+        if string in index:
+            forms.append(string)
+            return forms
+        forms.extend(exceptions.get(string, []))
+        oov_forms = []
+        if not forms:
+            for old, new in rules:
+                if string.endswith(old):
+                    form = string[: len(string) - len(old)] + new
+                    if not form:
+                        pass
+                    elif form in index or not form.isalpha():
+                        forms.append(form)
+                    else:
+                        oov_forms.append(form)
+        if not forms:
+            forms.extend(oov_forms)
+        if not forms and string in lookup_table.keys():
+            forms.append(lookup_table[string][0])
+        if not forms:
+            forms.append(string)
+        return list(set(forms))
+
+
+def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
     """
     Check whether we're dealing with an uninflected paradigm, so we can
     avoid lemmatization entirely.
@@ -88,48 +132,3 @@ class FrenchLemmatizer(Lemmatizer):
         return True
     else:
         return False
-
-    def noun(self, string, morphology=None):
-        return self(string, "noun", morphology)
-
-    def verb(self, string, morphology=None):
-        return self(string, "verb", morphology)
-
-    def adj(self, string, morphology=None):
-        return self(string, "adj", morphology)
-
-    def punct(self, string, morphology=None):
-        return self(string, "punct", morphology)
-
-    def lookup(self, string, orth=None):
-        lookup_table = self.lookups.get_table("lemma_lookup", {})
-        if orth is not None and orth in lookup_table:
-            return lookup_table[orth][0]
-        return string
-
-    def lemmatize(self, string, index, exceptions, rules):
-        lookup_table = self.lookups.get_table("lemma_lookup", {})
-        string = string.lower()
-        forms = []
-        if string in index:
-            forms.append(string)
-            return forms
-        forms.extend(exceptions.get(string, []))
-        oov_forms = []
-        if not forms:
-            for old, new in rules:
-                if string.endswith(old):
-                    form = string[: len(string) - len(old)] + new
-                    if not form:
-                        pass
-                    elif form in index or not form.isalpha():
-                        forms.append(form)
-                    else:
-                        oov_forms.append(form)
-        if not forms:
-            forms.extend(oov_forms)
-        if not forms and string in lookup_table.keys():
-            forms.append(lookup_table[string][0])
-        if not forms:
-            forms.append(string)
-        return list(set(forms))
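
Illustrative sketch (not part of the diff): a minimal exercise of the lookup fallback in FrenchLemmatizer.__call__ shown above. The table contents are made up for the example; real data would come from spacy-lookups-data.

# The "avions" -> "avion" entry below is invented example data.
from spacy.lookups import Lookups
from spacy.lang.fr.lemmatizer import FrenchLemmatizer

lookups = Lookups()
lookups.add_table("lemma_lookup", {"avions": "avion"})
lemmatizer = FrenchLemmatizer(lookups)
# No "lemma_rules" table is registered, so __call__ falls back to the lookup table.
assert lemmatizer("avions", "noun") == ["avion"]
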
@@ -1,23 +1,33 @@
+from typing import Set
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ga"
+stop_words = {"@language_data": "spacy.ga.stop_words"}
+"""
+
+
+@registry.language_data("spacy.ga.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class IrishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "ga"
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
 
 
 class Irish(Language):
     lang = "ga"
     Defaults = IrishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Irish"]
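
Illustrative sketch (not part of the diff): how a {"@language_data": ...} reference in DEFAULT_CONFIG is expected to resolve for the Irish (ga) module above. It assumes the registry behaves like a standard catalogue registry with a .get() lookup.

# Assumption: registry.language_data.get() returns the function registered above.
import spacy.lang.ga  # noqa: F401 - importing the package runs the registration
from spacy.lang.ga.stop_words import STOP_WORDS
from spacy.util import registry

stop_words_func = registry.language_data.get("spacy.ga.stop_words")
assert stop_words_func() is STOP_WORDS
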
@@ -1,15 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
-
 from ...language import Language
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "gu"
+stop_words = {"@language_data": "spacy.gu.stop_words"}
+"""
 
 
-class GujaratiDefaults(Language.Defaults):
-    stop_words = STOP_WORDS
+@registry.language_data("spacy.gu.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Gujarati(Language):
     lang = "gu"
-    Defaults = GujaratiDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Gujarati"]
@@ -1,22 +1,37 @@
-from .stop_words import STOP_WORDS
+from typing import Set
+from thinc.api import Config
+
+from .stop_words import STOP_WORDS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "he"
+stop_words = {"@language_data": "spacy.he.stop_words"}
+
+[nlp.writing_system]
+direction = "rtl"
+has_case = false
+has_letters = true
+"""
+
+
+@registry.language_data("spacy.he.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class HebrewDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "he"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-    stop_words = STOP_WORDS
-    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
 
 
 class Hebrew(Language):
     lang = "he"
     Defaults = HebrewDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Hebrew"]
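
Illustrative sketch (not part of the diff): what the Hebrew DEFAULT_CONFIG above parses to, replacing the old writing_system dict on the Defaults class. It assumes thinc's Config exposes sections as nested dicts.

# Assumption: nested [nlp.writing_system] is reachable as config["nlp"]["writing_system"].
from thinc.api import Config
from spacy.lang.he import DEFAULT_CONFIG

config = Config().from_str(DEFAULT_CONFIG)
assert config["nlp"]["lang"] == "he"
assert config["nlp"]["writing_system"] == {
    "direction": "rtl",
    "has_case": False,
    "has_letters": True,
}
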
@@ -1,20 +1,33 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "hi"
+stop_words = {"@language_data": "spacy.hi.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.hi.lex_attr_getters"}
+"""
 
 
-class HindiDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "hi"
-    stop_words = STOP_WORDS
+@registry.language_data("spacy.hi.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.hi.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class Hindi(Language):
     lang = "hi"
-    Defaults = HindiDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Hindi"]
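
Illustrative sketch (not part of the diff): the registered lex_attr_getters entry for Hindi simply hands back the LEX_ATTRS mapping of attribute IDs to callables, so a consumer can look it up by name instead of importing the module. It assumes a catalogue-style registry with .get().

# Assumption: the registered function returns the module-level LEX_ATTRS dict unchanged.
import spacy.lang.hi  # noqa: F401 - importing the package runs the registrations
from spacy.util import registry

getters = registry.language_data.get("spacy.hi.lex_attr_getters")()
assert getters and all(callable(func) for func in getters.values())
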
@@ -1,25 +1,39 @@
-from .stop_words import STOP_WORDS
+from typing import Set
+from thinc.api import Config
+
+from .stop_words import STOP_WORDS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "hr"
+stop_words = {"@language_data": "spacy.hr.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.hr.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class CroatianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "hr"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-    stop_words = STOP_WORDS
 
 
 class Croatian(Language):
     lang = "hr"
     Defaults = CroatianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Croatian"]
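
Illustrative sketch (not part of the diff): ${nlp:lang} in the [nlp.lemmatizer.data_paths] block above is a config variable rather than a literal. Assuming thinc's Config interpolates variables when parsing, the data_paths section ends up carrying the same language code as the [nlp] block.

# Assumption: from_str() interpolates ${nlp:lang} to the value of [nlp] lang.
from thinc.api import Config
from spacy.lang.hr import DEFAULT_CONFIG

config = Config().from_str(DEFAULT_CONFIG)
assert config["nlp"]["lemmatizer"]["data_paths"]["lang"] == config["nlp"]["lang"] == "hr"
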
@@ -1,22 +1,35 @@
+from typing import Set
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "hu"
+stop_words = {"@language_data": "spacy.hu.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.hu.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class HungarianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "hu"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
@@ -26,6 +39,7 @@ class HungarianDefaults(Language.Defaults):
 class Hungarian(Language):
     lang = "hu"
     Defaults = HungarianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Hungarian"]
@@ -1,21 +1,33 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
-from ...attrs import LANG
 from ...language import Language
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "hy"
+stop_words = {"@language_data": "spacy.hy.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.hy.lex_attr_getters"}
+"""
 
 
-class ArmenianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "hy"
-
-    lex_attr_getters.update(LEX_ATTRS)
-    stop_words = STOP_WORDS
+@registry.language_data("spacy.hy.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.hy.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class Armenian(Language):
     lang = "hy"
-    Defaults = ArmenianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Armenian"]
@@ -1,21 +1,43 @@
+from typing import Set, Dict, Callable, Any
+from thinc.config import Config
+
 from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "id"
+stop_words = {"@language_data": "spacy.id.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.id.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.id.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class IndonesianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "id"
-    lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
@@ -25,6 +47,7 @@ class IndonesianDefaults(Language.Defaults):
 class Indonesian(Language):
     lang = "id"
     Defaults = IndonesianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Indonesian"]
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "is"
+stop_words = {"@language_data": "spacy.is.stop_words"}
+"""
 
 
-class IcelandicDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "is"
-    stop_words = STOP_WORDS
+@registry.language_data("spacy.is.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Icelandic(Language):
     lang = "is"
-    Defaults = IcelandicDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Icelandic"]
@@ -1,20 +1,34 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "it"
+stop_words = {"@language_data": "spacy.it.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.it.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class ItalianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "it"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
@@ -24,6 +38,7 @@ class ItalianDefaults(Language.Defaults):
 class Italian(Language):
     lang = "it"
     Defaults = ItalianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Italian"]
@@ -1,21 +1,187 @@
+from typing import Optional, Union, Dict, Any, Set
+from pathlib import Path
 import srsly
-from collections import namedtuple, OrderedDict
+from collections import namedtuple
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .tag_map import TAG_MAP
 from .tag_orth_map import TAG_ORTH_MAP
 from .tag_bigram_map import TAG_BIGRAM_MAP
-from ...attrs import LANG
 from ...compat import copy_reg
 from ...errors import Errors
 from ...language import Language
 from ...symbols import POS
 from ...tokens import Doc
-from ...util import DummyTokenizer
+from ...util import DummyTokenizer, registry
 from ... import util
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ja"
+stop_words = {"@language_data": "spacy.ja.stop_words"}
+
+[nlp.tokenizer]
+@tokenizers = "spacy.JapaneseTokenizer.v1"
+split_mode = null
+
+[nlp.writing_system]
+direction = "ltr"
+has_case = false
+has_letters = false
+"""
+
+
+@registry.language_data("spacy.ja.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.tokenizers("spacy.JapaneseTokenizer.v1")
+def create_japanese_tokenizer(split_mode: Optional[str] = None):
+    def japanese_tokenizer_factory(nlp):
+        return JapaneseTokenizer(nlp, split_mode=split_mode)
+
+    return japanese_tokenizer_factory
+
+
+class JapaneseTokenizer(DummyTokenizer):
+    def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
+        self.vocab = nlp.vocab
+        self.split_mode = split_mode
+        self.tokenizer = try_sudachi_import(self.split_mode)
+
+    def __call__(self, text: str) -> Doc:
+        # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
+        sudachipy_tokens = self.tokenizer.tokenize(text)
+        dtokens = self._get_dtokens(sudachipy_tokens)
+        dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
+
+        # create Doc with tag bi-gram based part-of-speech identification rules
+        words, tags, inflections, lemmas, readings, sub_tokens_list = (
+            zip(*dtokens) if dtokens else [[]] * 6
+        )
+        sub_tokens_list = list(sub_tokens_list)
+        doc = Doc(self.vocab, words=words, spaces=spaces)
+        next_pos = None  # for bi-gram rules
+        for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
+            token.tag_ = dtoken.tag
+            if next_pos:  # already identified in previous iteration
+                token.pos = next_pos
+                next_pos = None
+            else:
+                token.pos, next_pos = resolve_pos(
+                    token.orth_,
+                    dtoken.tag,
+                    tags[idx + 1] if idx + 1 < len(tags) else None,
+                )
+            # if there's no lemma info (it's an unk) just use the surface
+            token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
+        doc.user_data["inflections"] = inflections
+        doc.user_data["reading_forms"] = readings
+        doc.user_data["sub_tokens"] = sub_tokens_list
+        return doc
+
+    def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True):
+        sub_tokens_list = (
+            self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
+        )
+        dtokens = [
+            DetailedToken(
+                token.surface(),  # orth
+                "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]),  # tag
+                ",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]),  # inf
+                token.dictionary_form(),  # lemma
+                token.reading_form(),  # user_data['reading_forms']
+                sub_tokens_list[idx]
+                if sub_tokens_list
+                else None,  # user_data['sub_tokens']
+            )
+            for idx, token in enumerate(sudachipy_tokens)
+            if len(token.surface()) > 0
+            # remove empty tokens which can be produced with characters like … that
+        ]
+        # Sudachi normalizes internally and outputs each space char as a token.
+        # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
+        return [
+            t
+            for idx, t in enumerate(dtokens)
+            if idx == 0
+            or not t.surface.isspace()
+            or t.tag != "空白"
+            or not dtokens[idx - 1].surface.isspace()
+            or dtokens[idx - 1].tag != "空白"
+        ]
+
+    def _get_sub_tokens(self, sudachipy_tokens):
+        if (
+            self.split_mode is None or self.split_mode == "A"
+        ):  # do nothing for default split mode
+            return None
+
+        sub_tokens_list = []  # list of (list of list of DetailedToken | None)
+        for token in sudachipy_tokens:
+            sub_a = token.split(self.tokenizer.SplitMode.A)
+            if len(sub_a) == 1:  # no sub tokens
+                sub_tokens_list.append(None)
+            elif self.split_mode == "B":
+                sub_tokens_list.append([self._get_dtokens(sub_a, False)])
+            else:  # "C"
+                sub_b = token.split(self.tokenizer.SplitMode.B)
+                if len(sub_a) == len(sub_b):
+                    dtokens = self._get_dtokens(sub_a, False)
+                    sub_tokens_list.append([dtokens, dtokens])
+                else:
+                    sub_tokens_list.append(
+                        [
+                            self._get_dtokens(sub_a, False),
+                            self._get_dtokens(sub_b, False),
+                        ]
+                    )
+        return sub_tokens_list
+
+    def _get_config(self) -> Dict[str, Any]:
+        return {"split_mode": self.split_mode}
+
+    def _set_config(self, config: Dict[str, Any] = {}) -> None:
+        self.split_mode = config.get("split_mode", None)
+
+    def to_bytes(self, **kwargs) -> bytes:
+        serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())}
+        return util.to_bytes(serializers, [])
+
+    def from_bytes(self, data: bytes, **kwargs) -> "JapaneseTokenizer":
+        deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))}
+        util.from_bytes(data, deserializers, [])
+        self.tokenizer = try_sudachi_import(self.split_mode)
+        return self
+
+    def to_disk(self, path: Union[str, Path], **kwargs) -> None:
+        path = util.ensure_path(path)
+        serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
+        return util.to_disk(path, serializers, [])
+
+    def from_disk(self, path: Union[str, Path], **kwargs) -> "JapaneseTokenizer":
+        path = util.ensure_path(path)
+        serializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))}
+        util.from_disk(path, serializers, [])
+        self.tokenizer = try_sudachi_import(self.split_mode)
+        return self
+
+
+class JapaneseDefaults(Language.Defaults):
+    tag_map = TAG_MAP
+    syntax_iterators = SYNTAX_ITERATORS
+
+
+class Japanese(Language):
+    lang = "ja"
+    Defaults = JapaneseDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 # Hold the attributes we need with convenient names
 DetailedToken = namedtuple(
     "DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]
@@ -133,161 +299,6 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
     return text_dtokens, text_spaces
 
 
-class JapaneseTokenizer(DummyTokenizer):
-    def __init__(self, cls, nlp=None, config={}):
-        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        self.split_mode = config.get("split_mode", None)
-        self.tokenizer = try_sudachi_import(self.split_mode)
-
-    def __call__(self, text):
-        # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
-        sudachipy_tokens = self.tokenizer.tokenize(text)
-        dtokens = self._get_dtokens(sudachipy_tokens)
-        dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
-
-        # create Doc with tag bi-gram based part-of-speech identification rules
-        words, tags, inflections, lemmas, readings, sub_tokens_list = (
-            zip(*dtokens) if dtokens else [[]] * 6
-        )
-        sub_tokens_list = list(sub_tokens_list)
-        doc = Doc(self.vocab, words=words, spaces=spaces)
-        next_pos = None  # for bi-gram rules
-        for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
-            token.tag_ = dtoken.tag
-            if next_pos:  # already identified in previous iteration
-                token.pos = next_pos
-                next_pos = None
-            else:
-                token.pos, next_pos = resolve_pos(
-                    token.orth_,
-                    dtoken.tag,
-                    tags[idx + 1] if idx + 1 < len(tags) else None,
-                )
-            # if there's no lemma info (it's an unk) just use the surface
-            token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
-
-        doc.user_data["inflections"] = inflections
-        doc.user_data["reading_forms"] = readings
-        doc.user_data["sub_tokens"] = sub_tokens_list
-
-        return doc
-
-    def _get_dtokens(self, sudachipy_tokens, need_sub_tokens=True):
-        sub_tokens_list = (
-            self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
-        )
-        dtokens = [
-            DetailedToken(
-                token.surface(),  # orth
-                "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]),  # tag
-                ",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]),  # inf
-                token.dictionary_form(),  # lemma
-                token.reading_form(),  # user_data['reading_forms']
-                sub_tokens_list[idx]
-                if sub_tokens_list
-                else None,  # user_data['sub_tokens']
-            )
-            for idx, token in enumerate(sudachipy_tokens)
-            if len(token.surface()) > 0
-            # remove empty tokens which can be produced with characters like … that
-        ]
-        # Sudachi normalizes internally and outputs each space char as a token.
-        # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
-        return [
-            t
-            for idx, t in enumerate(dtokens)
-            if idx == 0
-            or not t.surface.isspace()
-            or t.tag != "空白"
-            or not dtokens[idx - 1].surface.isspace()
-            or dtokens[idx - 1].tag != "空白"
-        ]
-
-    def _get_sub_tokens(self, sudachipy_tokens):
-        if (
-            self.split_mode is None or self.split_mode == "A"
-        ):  # do nothing for default split mode
-            return None
-
-        sub_tokens_list = []  # list of (list of list of DetailedToken | None)
-        for token in sudachipy_tokens:
-            sub_a = token.split(self.tokenizer.SplitMode.A)
-            if len(sub_a) == 1:  # no sub tokens
-                sub_tokens_list.append(None)
-            elif self.split_mode == "B":
-                sub_tokens_list.append([self._get_dtokens(sub_a, False)])
-            else:  # "C"
-                sub_b = token.split(self.tokenizer.SplitMode.B)
-                if len(sub_a) == len(sub_b):
-                    dtokens = self._get_dtokens(sub_a, False)
-                    sub_tokens_list.append([dtokens, dtokens])
-                else:
-                    sub_tokens_list.append(
-                        [
-                            self._get_dtokens(sub_a, False),
-                            self._get_dtokens(sub_b, False),
-                        ]
-                    )
-        return sub_tokens_list
-
-    def _get_config(self):
-        config = OrderedDict((("split_mode", self.split_mode),))
-        return config
-
-    def _set_config(self, config={}):
-        self.split_mode = config.get("split_mode", None)
-
-    def to_bytes(self, **kwargs):
-        serializers = OrderedDict(
-            (("cfg", lambda: srsly.json_dumps(self._get_config())),)
-        )
-        return util.to_bytes(serializers, [])
-
-    def from_bytes(self, data, **kwargs):
-        deserializers = OrderedDict(
-            (("cfg", lambda b: self._set_config(srsly.json_loads(b))),)
-        )
-        util.from_bytes(data, deserializers, [])
-        self.tokenizer = try_sudachi_import(self.split_mode)
-        return self
-
-    def to_disk(self, path, **kwargs):
-        path = util.ensure_path(path)
-        serializers = OrderedDict(
-            (("cfg", lambda p: srsly.write_json(p, self._get_config())),)
-        )
-        return util.to_disk(path, serializers, [])
-
-    def from_disk(self, path, **kwargs):
-        path = util.ensure_path(path)
-        serializers = OrderedDict(
-            (("cfg", lambda p: self._set_config(srsly.read_json(p))),)
-        )
-        util.from_disk(path, serializers, [])
-        self.tokenizer = try_sudachi_import(self.split_mode)
-
-
-class JapaneseDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda _text: "ja"
-    stop_words = STOP_WORDS
-    tag_map = TAG_MAP
-    syntax_iterators = SYNTAX_ITERATORS
-    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
-
-    @classmethod
-    def create_tokenizer(cls, nlp=None, config={}):
-        return JapaneseTokenizer(cls, nlp, config)
-
-
-class Japanese(Language):
-    lang = "ja"
-    Defaults = JapaneseDefaults
-
-    def make_doc(self, text):
-        return self.tokenizer(text)
-
-
 def pickle_japanese(instance):
     return Japanese, tuple()
 
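
Illustrative sketch (not part of the diff): the registered Japanese tokenizer entry point is a two-step factory, i.e. spacy.JapaneseTokenizer.v1 is called with the config arguments (split_mode) and returns a callable that takes nlp and builds the tokenizer. The manual wiring below mimics what resolving [nlp.tokenizer] is meant to do; it assumes SudachiPy is installed so try_sudachi_import() succeeds, and that Japanese() is constructible from its defaults at this point in the refactor.

# Assumption: SudachiPy (sudachipy + sudachidict_core) is available.
from spacy.lang.ja import Japanese, create_japanese_tokenizer

nlp = Japanese()
tokenizer = create_japanese_tokenizer(split_mode="B")(nlp)
doc = tokenizer("日本語のテキストを解析します")
print([(token.text, token.tag_, token.lemma_) for token in doc])
print(doc.user_data["sub_tokens"])  # populated for split modes "B" and "C"
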
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class KannadaDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "kn"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "kn"
+stop_words = {"@language_data": "spacy.kn.stop_words"}
+"""
+
+
+@registry.language_data("spacy.kn.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Kannada(Language):
     lang = "kn"
-    Defaults = KannadaDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Kannada"]
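The Kannada module above shows the new shape shared by most of the small languages in this diff: language data moves out of a Defaults subclass into a DEFAULT_CONFIG string plus functions registered under registry.language_data. A rough usage sketch, assuming the module layout shown here and that the new language_data table behaves like spaCy's other catalogue-based registries:

    from thinc.api import Config
    from spacy.lang.kn import DEFAULT_CONFIG  # importing the module registers spacy.kn.stop_words
    from spacy.util import registry

    config = Config().from_str(DEFAULT_CONFIG)
    print(config["nlp"]["lang"])        # "kn"
    print(config["nlp"]["stop_words"])  # {"@language_data": "spacy.kn.stop_words"}

    # spaCy resolves the "@language_data" reference through the registry:
    stop_words = registry.language_data.get("spacy.kn.stop_words")()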
@@ -1,51 +1,52 @@
+from typing import Set, Optional, Any, Dict
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
-from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc
 from ...compat import copy_reg
-from ...util import DummyTokenizer
+from ...util import DummyTokenizer, registry
 
 
-def try_mecab_import():
-    try:
-        from natto import MeCab
-
-        return MeCab
-    except ImportError:
-        raise ImportError(
-            "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
-            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
-            "and [natto-py](https://github.com/buruzaemon/natto-py)"
-        )
-
-
-# fmt: on
-
-
-def check_spaces(text, tokens):
-    prev_end = -1
-    start = 0
-    for token in tokens:
-        idx = text.find(token, start)
-        if prev_end > 0:
-            yield prev_end != idx
-        prev_end = idx + len(token)
-        start = prev_end
-    if start > 0:
-        yield False
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ko"
+stop_words = {"@language_data": "spacy.ko.stop_words"}
+
+[nlp.tokenizer]
+@tokenizers = "spacy.KoreanTokenizer.v1"
+
+[nlp.writing_system]
+direction = "ltr"
+has_case = false
+has_letters = false
+"""
+
+
+@registry.language_data("spacy.ko.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.tokenizers("spacy.KoreanTokenizer.v1")
+def create_korean_tokenizer():
+    def korean_tokenizer_factory(nlp):
+        return KoreanTokenizer(nlp)
+
+    return korean_tokenizer_factory
 
 
 class KoreanTokenizer(DummyTokenizer):
-    def __init__(self, cls, nlp=None):
-        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
+    def __init__(self, nlp: Optional[Language] = None):
+        self.vocab = nlp.vocab
         MeCab = try_mecab_import()
         self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
 
     def __del__(self):
         self.mecab_tokenizer.__del__()
 
-    def __call__(self, text):
+    def __call__(self, text: str) -> Doc:
         dtokens = list(self.detailed_tokens(text))
         surfaces = [dt["surface"] for dt in dtokens]
         doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
@@ -56,7 +57,7 @@ class KoreanTokenizer(DummyTokenizer):
         doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
         return doc
 
-    def detailed_tokens(self, text):
+    def detailed_tokens(self, text: str) -> Dict[str, Any]:
         # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
         # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
         for node in self.mecab_tokenizer.parse(text, as_nodes=True):
@@ -72,23 +73,39 @@ class KoreanTokenizer(DummyTokenizer):
 
 
 class KoreanDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda _text: "ko"
-    stop_words = STOP_WORDS
     tag_map = TAG_MAP
-    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
-
-    @classmethod
-    def create_tokenizer(cls, nlp=None):
-        return KoreanTokenizer(cls, nlp)
 
 
 class Korean(Language):
     lang = "ko"
     Defaults = KoreanDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
-    def make_doc(self, text):
-        return self.tokenizer(text)
+
+def try_mecab_import() -> None:
+    try:
+        from natto import MeCab
+
+        return MeCab
+    except ImportError:
+        raise ImportError(
+            "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
+            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
+            "and [natto-py](https://github.com/buruzaemon/natto-py)"
+        )
+
+
+def check_spaces(text, tokens):
+    prev_end = -1
+    start = 0
+    for token in tokens:
+        idx = text.find(token, start)
+        if prev_end > 0:
+            yield prev_end != idx
+        prev_end = idx + len(token)
+        start = prev_end
+    if start > 0:
+        yield False
 
 
 def pickle_korean(instance):
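Two details of the Korean refactor are worth spelling out: the tokenizer is now built through a registered two-step factory (create_korean_tokenizer returns korean_tokenizer_factory, which spaCy later calls with the nlp object so the vocab is available), and check_spaces re-aligns the MeCab surface forms with the original text to recover the spaces flags for the Doc. A small worked example of check_spaces with made-up input:

    text = "안녕 하세요."
    tokens = ["안녕", "하세요", "."]
    # one boolean per token: was this token followed by whitespace in `text`?
    spaces = list(check_spaces(text, tokens))
    assert spaces == [True, False, False]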
@@ -1,26 +1,49 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
 
 
+DEFAULT_CONFIG = """
+[nlp]
+lang = "lb"
+stop_words = {"@language_data": "spacy.lb.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.lb.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.lb.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
 class LuxembourgishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "lb"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     infixes = TOKENIZER_INFIXES
 
 
 class Luxembourgish(Language):
     lang = "lb"
     Defaults = LuxembourgishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Luxembourgish"]
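The [nlp.lemmatizer] and [nlp.lemmatizer.data_paths] blocks added for Luxembourgish are the first of several identical ones in this diff: the lemmatizer is looked up by name, and its lookup tables come from the spacy-lookups-data package, with ${nlp:lang} interpolated from the [nlp] section so the language code is written only once. A hedged sketch of what parsing the string should give, assuming thinc's Config resolves the interpolation:

    from thinc.api import Config
    from spacy.lang.lb import DEFAULT_CONFIG  # module layout as in this diff

    config = Config().from_str(DEFAULT_CONFIG)
    # ${nlp:lang} in the data_paths block is filled in from [nlp]
    print(config["nlp"]["lemmatizer"]["data_paths"]["lang"])  # expected: "lb"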
@@ -1,3 +1,4 @@
+from typing import Set
 import unicodedata
 import re
 
@@ -21,21 +22,21 @@ _tlds = set(
 )
 
 
-def is_punct(text):
+def is_punct(text: str) -> bool:
     for char in text:
         if not unicodedata.category(char).startswith("P"):
             return False
     return True
 
 
-def is_ascii(text):
+def is_ascii(text: str) -> bool:
     for char in text:
         if ord(char) >= 128:
             return False
     return True
 
 
-def like_num(text):
+def like_num(text: str) -> bool:
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
     # can be overwritten by lang with list of number words
@@ -49,64 +50,31 @@ def like_num(text):
     return False
 
 
-def is_bracket(text):
+def is_bracket(text: str) -> bool:
     brackets = ("(", ")", "[", "]", "{", "}", "<", ">")
     return text in brackets
 
 
-def is_quote(text):
-    quotes = (
-        '"',
-        "'",
-        "`",
-        "«",
-        "»",
-        "‘",
-        "’",
-        "‚",
-        "‛",
-        "“",
-        "”",
-        "„",
-        "‟",
-        "‹",
-        "›",
-        "❮",
-        "❯",
-        "''",
-        "``",
-    )
+def is_quote(text: str) -> bool:
+    # fmt: off
+    quotes = ('"', "'", "`", "«", "»", "‘", "’", "‚", "‛", "“", "”", "„", "‟", "‹", "›", "❮", "❯", "''", "``")
+    # fmt: on
     return text in quotes
 
 
-def is_left_punct(text):
-    left_punct = (
-        "(",
-        "[",
-        "{",
-        "<",
-        '"',
-        "'",
-        "«",
-        "‘",
-        "‚",
-        "‛",
-        "“",
-        "„",
-        "‟",
-        "‹",
-        "❮",
-        "``",
-    )
+def is_left_punct(text: str) -> bool:
+    # fmt: off
+    left_punct = ("(", "[", "{", "<", '"', "'", "«", "‘", "‚", "‛", "“", "„", "‟", "‹", "❮", "``")
+    # fmt: on
     return text in left_punct
 
 
-def is_right_punct(text):
+def is_right_punct(text: str) -> bool:
     right_punct = (")", "]", "}", ">", '"', "'", "»", "’", "”", "›", "❯", "''")
     return text in right_punct
 
 
-def is_currency(text):
+def is_currency(text: str) -> bool:
     # can be overwritten by lang with list of currency words, e.g. dollar, euro
     for char in text:
         if unicodedata.category(char) != "Sc":
@@ -114,11 +82,11 @@ def is_currency(text):
     return True
 
 
-def like_email(text):
+def like_email(text: str) -> bool:
     return bool(_like_email(text))
 
 
-def like_url(text):
+def like_url(text: str) -> bool:
     # We're looking for things that function in text like URLs. So, valid URL
     # or not, anything they say http:// is going to be good.
     if text.startswith("http://") or text.startswith("https://"):
@@ -144,7 +112,7 @@ def like_url(text):
     return False
 
 
-def word_shape(text):
+def word_shape(text: str) -> str:
     if len(text) >= 100:
         return "LONG"
     shape = []
@@ -171,46 +139,52 @@ def word_shape(text):
     return "".join(shape)
 
 
-def lower(string):
+def lower(string: str) -> str:
     return string.lower()
 
 
-def prefix(string):
+def prefix(string: str) -> str:
     return string[0]
 
 
-def suffix(string):
+def suffix(string: str) -> str:
     return string[-3:]
 
 
-def is_alpha(string):
+def is_alpha(string: str) -> bool:
     return string.isalpha()
 
 
-def is_digit(string):
+def is_digit(string: str) -> bool:
     return string.isdigit()
 
 
-def is_lower(string):
+def is_lower(string: str) -> bool:
     return string.islower()
 
 
-def is_space(string):
+def is_space(string: str) -> bool:
     return string.isspace()
 
 
-def is_title(string):
+def is_title(string: str) -> bool:
     return string.istitle()
 
 
-def is_upper(string):
+def is_upper(string: str) -> bool:
     return string.isupper()
 
 
-def is_stop(string, stops=set()):
+def is_stop(string: str, stops: Set[str] = set()) -> bool:
     return string.lower() in stops
 
 
+def get_lang(text: str, lang: str = "") -> str:
+    # This function is partially applied so lang code can be passed in
+    # automatically while still allowing pickling
+    return lang
+
+
 LEX_ATTRS = {
     attrs.LOWER: lower,
     attrs.NORM: lower,
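The comment on the new get_lang helper explains the design choice: the LANG getter used to be a per-language lambda, but a module-level function can be partially applied with the language code and still be pickled. A quick illustration (the "xx" code is an arbitrary stand-in):

    import functools
    import pickle

    get_lang_xx = functools.partial(get_lang, lang="xx")
    assert get_lang_xx("any text") == "xx"
    # a partial of a module-level function round-trips through pickle; a lambda would not
    assert pickle.loads(pickle.dumps(get_lang_xx))("any text") == "xx"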
@@ -1,28 +1,35 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "lij"
+stop_words = {"@language_data": "spacy.lij.stop_words"}
+"""
+
+
+@registry.language_data("spacy.lij.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class LigurianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "lij"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     infixes = TOKENIZER_INFIXES
 
 
 class Ligurian(Language):
     lang = "lij"
     Defaults = LigurianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Ligurian"]
@@ -1,27 +1,41 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
 
 
-def _return_lt(_):
-    return "lt"
-
-
+DEFAULT_CONFIG = """
+[nlp]
+lang = "lt"
+stop_words = {"@language_data": "spacy.lt.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.lt.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.lt.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
 class LithuanianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = _return_lt
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    lex_attr_getters.update(LEX_ATTRS)
-
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     mod_base_exceptions = {
@@ -29,12 +43,12 @@ class LithuanianDefaults(Language.Defaults):
     }
     del mod_base_exceptions["8)"]
     tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
 
 
 class Lithuanian(Language):
     lang = "lt"
     Defaults = LithuanianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Lithuanian"]
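LithuanianDefaults keeps its trimmed copy of the base tokenizer exceptions: entries ending in "." are filtered out and the "8)" emoticon is deleted explicitly. The same two steps in isolation, with toy data standing in for BASE_EXCEPTIONS:

    BASE = {"a.m.": [{"ORTH": "a.m."}], "8)": [{"ORTH": "8)"}], ":)": [{"ORTH": ":)"}]}
    mod_base_exceptions = {exc: val for exc, val in BASE.items() if not exc.endswith(".")}
    del mod_base_exceptions["8)"]
    assert list(mod_base_exceptions) == [":)"]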
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class LatvianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "lv"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "lv"
+stop_words = {"@language_data": "spacy.lv.stop_words"}
+"""
+
+
+@registry.language_data("spacy.lv.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Latvian(Language):
     lang = "lv"
-    Defaults = LatvianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Latvian"]
@@ -1,15 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
-
 from ...language import Language
+from ...util import registry
 
 
-class MalayalamDefaults(Language.Defaults):
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ml"
+stop_words = {"@language_data": "spacy.ml.stop_words"}
+"""
+
+
+@registry.language_data("spacy.ml.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Malayalam(Language):
     lang = "ml"
-    Defaults = MalayalamDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Malayalam"]
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class MarathiDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "mr"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "af"
+stop_words = {"@language_data": "spacy.mr.stop_words"}
+"""
+
+
+@registry.language_data("spacy.mr.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Marathi(Language):
     lang = "mr"
-    Defaults = MarathiDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Marathi"]
@@ -1,33 +1,47 @@
+from typing import Set
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
 
 
+DEFAULT_CONFIG = """
+[nlp]
+lang = "nb"
+stop_words = {"@language_data": "spacy.nb.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.nb.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
 class NorwegianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "nb"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
 
 
 class Norwegian(Language):
     lang = "nb"
     Defaults = NorwegianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Norwegian"]
@@ -1,23 +1,33 @@
-# coding: utf8
-from __future__ import unicode_literals
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
 
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class NepaliDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "ne" # Nepali language ISO code
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ne"
+stop_words = {"@language_data": "spacy.ne.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.ne.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.ne.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.ne.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class Nepali(Language):
     lang = "ne"
-    Defaults = NepaliDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Nepali"]
@@ -1,3 +1,6 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -5,36 +8,51 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lookups import Lookups
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
 
 
+DEFAULT_CONFIG = """
+[nlp]
+lang = "nl"
+stop_words = {"@language_data": "spacy.nl.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.DutchLemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.nl.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.nl.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
+@registry.lemmatizers("spacy.DutchLemmatizer.v1")
+def create_dutch_lemmatizer(data_paths: dict = {}) -> DutchLemmatizer:
+    return DutchLemmatizer(data_paths=data_paths)
+
+
 class DutchDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "nl"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
 
-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return DutchLemmatizer(lookups)
-
 
 class Dutch(Language):
     lang = "nl"
     Defaults = DutchDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Dutch"]
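Dutch is one of the languages whose lemmatizer subclass now reaches the pipeline through the registry rather than a create_lemmatizer classmethod: the config names "spacy.DutchLemmatizer.v1", and resolving it calls create_dutch_lemmatizer with the data_paths block filled in from spacy-lookups-data. A sketch of the lookup step only (actually constructing the lemmatizer needs the resolved paths, which are omitted here):

    from spacy.util import registry
    import spacy.lang.nl  # importing the module registers the factory

    factory = registry.lemmatizers.get("spacy.DutchLemmatizer.v1")
    # spaCy resolves [nlp.lemmatizer.data_paths] first, then calls roughly:
    #     lemmatizer = factory(data_paths=resolved_paths)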
@@ -1,3 +1,5 @@
+from typing import Optional, List, Dict, Tuple
+
 from ...lemmatizer import Lemmatizer
 from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
 
@@ -34,7 +36,9 @@ class DutchLemmatizer(Lemmatizer):
         "num": "num",
     }
 
-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
         # Difference 1: self.rules is assumed to be non-None, so no
         # 'is None' check required.
         # String lowercased from the get-go. All lemmatization results in
@@ -92,7 +96,7 @@ class DutchLemmatizer(Lemmatizer):
     # Overrides parent method so that a lowercased version of the string is
     # used to search the lookup table. This is necessary because our lookup
     # table consists entirely of lowercase keys.
-    def lookup(self, string, orth=None):
+    def lookup(self, string: str, orth: Optional[int] = None) -> str:
         lookup_table = self.lookups.get_table("lemma_lookup", {})
         string = string.lower()
         if orth is not None:
@@ -102,7 +106,13 @@ class DutchLemmatizer(Lemmatizer):
 
     # Reimplemented to focus more on application of suffix rules and to return
     # as early as possible.
-    def lemmatize(self, string, index, exceptions, rules):
+    def lemmatize(
+        self,
+        string: str,
+        index: Dict[str, List[str]],
+        exceptions: Dict[str, Dict[str, List[str]]],
+        rules: Dict[str, List[List[str]]],
+    ) -> Tuple[List[str], bool]:
         # returns (forms, is_known: bool)
         oov_forms = []
         for old, new in rules:
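The annotations added to DutchLemmatizer make the contracts explicit: __call__ returns a list of candidate lemmas, lookup() takes the surface string plus an optional orth ID and always returns a single string, searching the table with the lowercased form, and lemmatize() returns the forms together with an is_known flag. A hypothetical call against an already-initialized DutchLemmatizer instance (construction omitted):

    # "Katten" is looked up as "katten", because the table only has lowercase keys
    lemma = nl_lemmatizer.lookup("Katten")
    assert isinstance(lemma, str)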
@@ -1,43 +1,60 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import PolishLemmatizer
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import add_lookups
-from ...lookups import Lookups
+from ...util import registry
 
 
+DEFAULT_CONFIG = """
+[nlp]
+lang = "pl"
+stop_words = {"@language_data": "spacy.pl.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.PolishLemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.pl.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.pl.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
+@registry.lemmatizers("spacy.PolishLemmatizer.v1")
+def create_polish_lemmatizer(data_paths: dict = {}) -> PolishLemmatizer:
+    return PolishLemmatizer(data_paths=data_paths)
+
+
 class PolishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "pl"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     mod_base_exceptions = {
         exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
     }
     tokenizer_exceptions = mod_base_exceptions
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
 
-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return PolishLemmatizer(lookups)
-
 
 class Polish(Language):
     lang = "pl"
     Defaults = PolishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Polish"]
@@ -1,3 +1,5 @@
+from typing import Optional, List, Dict
+
 from ...lemmatizer import Lemmatizer
 from ...parts_of_speech import NAMES
 
@@ -7,30 +9,29 @@ class PolishLemmatizer(Lemmatizer):
     # dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS.
     # It utilizes some prefix based improvements for verb and adjectives
     # lemmatization, as well as case-sensitive lemmatization for nouns.
-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
         if isinstance(univ_pos, int):
             univ_pos = NAMES.get(univ_pos, "X")
         univ_pos = univ_pos.upper()
-
         lookup_pos = univ_pos.lower()
         if univ_pos == "PROPN":
             lookup_pos = "noun"
         lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})
-
         if univ_pos == "NOUN":
             return self.lemmatize_noun(string, morphology, lookup_table)
-
         if univ_pos != "PROPN":
             string = string.lower()
-
         if univ_pos == "ADJ":
             return self.lemmatize_adj(string, morphology, lookup_table)
         elif univ_pos == "VERB":
             return self.lemmatize_verb(string, morphology, lookup_table)
-
         return [lookup_table.get(string, string.lower())]
 
-    def lemmatize_adj(self, string, morphology, lookup_table):
+    def lemmatize_adj(
+        self, string: str, morphology: dict, lookup_table: Dict[str, str]
+    ) -> List[str]:
         # this method utilizes different procedures for adjectives
         # with 'nie' and 'naj' prefixes
         if string[:3] == "nie":
@@ -41,25 +42,26 @@ class PolishLemmatizer(Lemmatizer):
                     return [lookup_table[naj_search_string]]
             if search_string in lookup_table:
                 return [lookup_table[search_string]]
-
         if string[:3] == "naj":
             naj_search_string = string[3:]
             if naj_search_string in lookup_table:
                 return [lookup_table[naj_search_string]]
-
         return [lookup_table.get(string, string)]
 
-    def lemmatize_verb(self, string, morphology, lookup_table):
+    def lemmatize_verb(
+        self, string: str, morphology: dict, lookup_table: Dict[str, str]
+    ) -> List[str]:
         # this method utilizes a different procedure for verbs
         # with 'nie' prefix
         if string[:3] == "nie":
             search_string = string[3:]
 | 
				
			||||||
            if search_string in lookup_table:
 | 
					            if search_string in lookup_table:
 | 
				
			||||||
                return [lookup_table[search_string]]
 | 
					                return [lookup_table[search_string]]
 | 
				
			||||||
 | 
					 | 
				
			||||||
        return [lookup_table.get(string, string)]
 | 
					        return [lookup_table.get(string, string)]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def lemmatize_noun(self, string, morphology, lookup_table):
 | 
					    def lemmatize_noun(
 | 
				
			||||||
 | 
					        self, string: str, morphology: dict, lookup_table: Dict[str, str]
 | 
				
			||||||
 | 
					    ) -> List[str]:
 | 
				
			||||||
        # this method is case-sensitive, in order to work
 | 
					        # this method is case-sensitive, in order to work
 | 
				
			||||||
        # for incorrectly tagged proper names
 | 
					        # for incorrectly tagged proper names
 | 
				
			||||||
        if string != string.lower():
 | 
					        if string != string.lower():
 | 
				
			||||||
| 
						 | 
					@ -68,11 +70,16 @@ class PolishLemmatizer(Lemmatizer):
 | 
				
			||||||
            elif string in lookup_table:
 | 
					            elif string in lookup_table:
 | 
				
			||||||
                return [lookup_table[string]]
 | 
					                return [lookup_table[string]]
 | 
				
			||||||
            return [string.lower()]
 | 
					            return [string.lower()]
 | 
				
			||||||
 | 
					 | 
				
			||||||
        return [lookup_table.get(string, string)]
 | 
					        return [lookup_table.get(string, string)]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def lookup(self, string, orth=None):
 | 
					    def lookup(self, string: str, orth: Optional[int] = None) -> str:
 | 
				
			||||||
        return string.lower()
 | 
					        return string.lower()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def lemmatize(self, string, index, exceptions, rules):
 | 
					    def lemmatize(
 | 
				
			||||||
 | 
					        self,
 | 
				
			||||||
 | 
					        string: str,
 | 
				
			||||||
 | 
					        index: Dict[str, List[str]],
 | 
				
			||||||
 | 
					        exceptions: Dict[str, Dict[str, List[str]]],
 | 
				
			||||||
 | 
					        rules: Dict[str, List[List[str]]],
 | 
				
			||||||
 | 
					    ) -> List[str]:
 | 
				
			||||||
        raise NotImplementedError
 | 
					        raise NotImplementedError
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
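Rough usage sketch for the typed lemmatizer API above (not part of the patch; the lookup table contents are made up, and this assumes the base Lemmatizer constructor still accepts a Lookups object, as the removed create_lemmatizer helper did):

from spacy.lang.pl.lemmatizer import PolishLemmatizer
from spacy.lookups import Lookups

lookups = Lookups()
# Hypothetical entry: the real tables come from spacy-lookups-data
lookups.add_table("lemma_lookup_adj", {"milszy": "miły"})
lemmatizer = PolishLemmatizer(lookups)
print(lemmatizer("najmilszy", "ADJ"))  # -> ["miły"], via the "naj" prefix handling
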
@@ -1,20 +1,42 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "pt"
+stop_words = {"@language_data": "spacy.pt.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.pt.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.pt.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class PortugueseDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "pt"
-    lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES
 
@@ -22,6 +44,7 @@ class PortugueseDefaults(Language.Defaults):
 class Portuguese(Language):
     lang = "pt"
     Defaults = PortugueseDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Portuguese"]

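Sketch of how the {"@language_data": "spacy.pt.stop_words"} references in the config resolve (not part of the patch; assumes the module has been imported so the decorators above have run):

from spacy.util import registry
import spacy.lang.pt  # noqa: F401  - importing the module executes the @registry.language_data decorators

stop_words_func = registry.language_data.get("spacy.pt.stop_words")
STOP_WORDS = stop_words_func()  # the set returned by the registered function
print(type(STOP_WORDS))         # <class 'set'>
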
@@ -1,27 +1,40 @@
+from typing import Set
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
 
 # Lemma data note:
 # Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
 # Replaced characters using cedillas with the correct ones (ș and ț)
 
 
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ro"
+stop_words = {"@language_data": "spacy.ro.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.ro.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
 class RomanianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "ro"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
 
@@ -30,6 +43,7 @@ class RomanianDefaults(Language.Defaults):
 class Romanian(Language):
     lang = "ro"
     Defaults = RomanianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Romanian"]

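Sketch of the ${nlp:lang} interpolation used in the data_paths block above (not part of the patch; a rough illustration of how the value is copied from the [nlp] section when the config is parsed):

from thinc.api import Config

cfg = Config().from_str("""
[nlp]
lang = "ro"

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
""")
# The data_paths block requests spacy-lookups-data tables for the same language code
print(cfg["nlp"]["lemmatizer"]["data_paths"]["lang"])  # -> "ro"
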
@@ -1,32 +1,49 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...util import update_exc
+from ...util import update_exc, registry
 from ...language import Language
-from ...lookups import Lookups
-from ...attrs import LANG
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ru"
+stop_words = {"@language_data": "spacy.ru.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.RussianLemmatizer.v1"
+"""
+
+
+@registry.language_data("spacy.ru.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.ru.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
+@registry.lemmatizers("spacy.RussianLemmatizer.v1")
+def create_russian_lemmatizer() -> RussianLemmatizer:
+    return RussianLemmatizer()
+
+
 class RussianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "ru"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return RussianLemmatizer(lookups)
 
 
 class Russian(Language):
     lang = "ru"
     Defaults = RussianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Russian"]

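Sketch of how the @lemmatizers reference in the [nlp.lemmatizer] block is resolved (not part of the patch; requires pymorphy2 to be installed, since the factory builds a RussianLemmatizer):

from spacy.util import registry
import spacy.lang.ru  # noqa: F401  - importing the module registers the factory

factory = registry.lemmatizers.get("spacy.RussianLemmatizer.v1")
lemmatizer = factory()  # equivalent to RussianLemmatizer()
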
@@ -1,11 +1,17 @@
+from typing import Optional, Tuple, Dict, List
+
 from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
 from ...lemmatizer import Lemmatizer
+from ...lookups import Lookups
+
+
+PUNCT_RULES = {"«": '"', "»": '"'}
 
 
 class RussianLemmatizer(Lemmatizer):
     _morph = None
 
-    def __init__(self, lookups=None):
+    def __init__(self, lookups: Optional[Lookups] = None) -> None:
         super(RussianLemmatizer, self).__init__(lookups)
         try:
             from pymorphy2 import MorphAnalyzer
@@ -19,15 +25,15 @@ class RussianLemmatizer(Lemmatizer):
         if RussianLemmatizer._morph is None:
             RussianLemmatizer._morph = MorphAnalyzer()
 
-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
         univ_pos = self.normalize_univ_pos(univ_pos)
         if univ_pos == "PUNCT":
             return [PUNCT_RULES.get(string, string)]
-
         if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
             # Skip unchangeable pos
             return [string.lower()]
-
         analyses = self._morph.parse(string)
         filtered_analyses = []
         for analysis in analyses:
@@ -39,12 +45,10 @@ class RussianLemmatizer(Lemmatizer):
                 analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
             ):
                 filtered_analyses.append(analysis)
-
         if not len(filtered_analyses):
             return [string.lower()]
         if morphology is None or (len(morphology) == 1 and POS in morphology):
             return list(set([analysis.normal_form for analysis in filtered_analyses]))
-
         if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
             features_to_compare = ["Case", "Number", "Gender"]
         elif univ_pos == "NUM":
@@ -61,7 +65,6 @@ class RussianLemmatizer(Lemmatizer):
                 "VerbForm",
                 "Voice",
             ]
-
         analyses, filtered_analyses = filtered_analyses, []
         for analysis in analyses:
             _, analysis_morph = oc2ud(str(analysis.tag))
@@ -74,16 +77,14 @@ class RussianLemmatizer(Lemmatizer):
                     break
             else:
                 filtered_analyses.append(analysis)
-
         if not len(filtered_analyses):
             return [string.lower()]
         return list(set([analysis.normal_form for analysis in filtered_analyses]))
 
     @staticmethod
-    def normalize_univ_pos(univ_pos):
+    def normalize_univ_pos(univ_pos: str) -> Optional[str]:
         if isinstance(univ_pos, str):
             return univ_pos.upper()
-
         symbols_to_str = {
             ADJ: "ADJ",
             DET: "DET",
@@ -98,14 +99,14 @@ class RussianLemmatizer(Lemmatizer):
             return symbols_to_str[univ_pos]
         return None
 
-    def lookup(self, string, orth=None):
+    def lookup(self, string: str, orth: Optional[int] = None) -> str:
         analyses = self._morph.parse(string)
         if len(analyses) == 1:
             return analyses[0].normal_form
         return string
 
 
-def oc2ud(oc_tag):
+def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
     gram_map = {
         "_POS": {
             "ADJF": "ADJ",
@@ -160,11 +161,9 @@ def oc2ud(oc_tag):
         "Voice": {"actv": "Act", "pssv": "Pass"},
         "Abbr": {"Abbr": "Yes"},
     }
-
     pos = "X"
     morphology = dict()
     unmatched = set()
-
     grams = oc_tag.replace(" ", ",").split(",")
     for gram in grams:
         match = False
@@ -177,7 +176,6 @@ def oc2ud(oc_tag):
                     morphology[categ] = gmap[gram]
         if not match:
             unmatched.add(gram)
-
     while len(unmatched) > 0:
         gram = unmatched.pop()
         if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
@@ -186,8 +184,4 @@ def oc2ud(oc_tag):
             pos = "AUX"
         elif gram == "Pltm":
             morphology["Number"] = "Ptan"
-
     return pos, morphology
-
-
-PUNCT_RULES = {"«": '"', "»": '"'}

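Rough illustration of the pymorphy2 calls that lookup() relies on (not part of the patch; requires the pymorphy2 package):

from pymorphy2 import MorphAnalyzer

morph = MorphAnalyzer()
analyses = morph.parse("стали")
# An ambiguous form yields several analyses, so RussianLemmatizer.lookup()
# would return the input unchanged; a single analysis returns its normal_form.
print(len(analyses) > 1, analyses[0].normal_form)
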
@@ -1,20 +1,33 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class SinhalaDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "si"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "si"
+stop_words = {"@language_data": "spacy.si.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.si.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.si.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.si.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class Sinhala(Language):
     lang = "si"
-    Defaults = SinhalaDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Sinhala"]

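For languages that ship only stop words and, in some cases, lexical attributes (si here; sk, sl and sq follow the same pattern below), the whole Defaults class disappears. A small sketch of inspecting the result (not part of the patch):

from spacy.lang.si import Sinhala

nlp_block = Sinhala.default_config["nlp"]
print(nlp_block["lang"])        # -> "si"
print(nlp_block["stop_words"])  # -> {"@language_data": "spacy.si.stop_words"}
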
@@ -1,20 +1,33 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class SlovakDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "sk"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "sk"
+stop_words = {"@language_data": "spacy.sk.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.sk.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.sk.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.sk.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class Slovak(Language):
     lang = "sk"
-    Defaults = SlovakDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Slovak"]

@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class SlovenianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "sl"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "sl"
+stop_words = {"@language_data": "spacy.sl.stop_words"}
+"""
+
+
+@registry.language_data("spacy.sl.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Slovenian(Language):
     lang = "sl"
-    Defaults = SlovenianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Slovenian"]

@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class AlbanianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "sq"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "sq"
+stop_words = {"@language_data": "spacy.sq.stop_words"}
+"""
+
+
+@registry.language_data("spacy.sq.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Albanian(Language):
     lang = "sq"
-    Defaults = AlbanianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Albanian"]

@@ -1,23 +1,47 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "sr"
+stop_words = {"@language_data": "spacy.sr.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.sr.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.sr.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class SerbianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "sr"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
 
 
 class Serbian(Language):
     lang = "sr"
     Defaults = SerbianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Serbian"]

@@ -1,35 +1,54 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...language import Language
+from ...util import update_exc, registry
+from .syntax_iterators import SYNTAX_ITERATORS
 
 # Punctuation stolen from Danish
 from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
-from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
-from .syntax_iterators import SYNTAX_ITERATORS
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "sv"
+stop_words = {"@language_data": "spacy.sv.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.sv.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.sv.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class SwedishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "sv"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
 
 
 class Swedish(Language):
     lang = "sv"
     Defaults = SwedishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Swedish"]

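Not part of the patch: a quick sketch of the split that remains for languages like Swedish, where tokenizer-level settings stay on the Defaults class while lexical data moves into the config:

from spacy.lang.sv import Swedish

print(Swedish.Defaults.suffixes is not None)        # punctuation rules still live on Defaults
print(Swedish.default_config["nlp"]["stop_words"])  # stop words are now a registry reference
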
@@ -1,20 +1,33 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class TamilDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "ta"
-    lex_attr_getters.update(LEX_ATTRS)
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ta"
+stop_words = {"@language_data": "spacy.ta.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.ta.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.ta.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class Tamil(Language):
     lang = "ta"
-    Defaults = TamilDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Tamil"]

@@ -1,20 +1,33 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class TeluguDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "te"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "te"
+stop_words = {"@language_data": "spacy.te.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.te.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.te.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.te.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class Telugu(Language):
     lang = "te"
-    Defaults = TeluguDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Telugu"]

@@ -1,15 +1,44 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
-from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc
-from ...util import DummyTokenizer
+from ...util import DummyTokenizer, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "th"
+stop_words = {"@language_data": "spacy.th.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}
+
+[nlp.tokenizer]
+@tokenizers = "spacy.ThaiTokenizer.v1"
+"""
+
+
+@registry.language_data("spacy.th.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.th.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
+@registry.tokenizers("spacy.ThaiTokenizer.v1")
+def create_thai_tokenizer():
+    def thai_tokenizer_factory(nlp):
+        return ThaiTokenizer(nlp)
+
+    return thai_tokenizer_factory
+
+
 class ThaiTokenizer(DummyTokenizer):
-    def __init__(self, cls, nlp=None):
+    def __init__(self, nlp: Language) -> None:
         try:
             from pythainlp.tokenize import word_tokenize
         except ImportError:
@@ -17,34 +46,18 @@ class ThaiTokenizer(DummyTokenizer):
                 "The Thai tokenizer requires the PyThaiNLP library: "
                 "https://github.com/PyThaiNLP/pythainlp"
             )
-
         self.word_tokenize = word_tokenize
-        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
+        self.vocab = nlp.vocab
 
-    def __call__(self, text):
+    def __call__(self, text: str) -> Doc:
         words = list(self.word_tokenize(text))
         spaces = [False] * len(words)
         return Doc(self.vocab, words=words, spaces=spaces)
 
 
-class ThaiDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda _text: "th"
-    tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
-
-    @classmethod
-    def create_tokenizer(cls, nlp=None):
-        return ThaiTokenizer(cls, nlp)
-
-
 class Thai(Language):
     lang = "th"
-    Defaults = ThaiDefaults
-
-    def make_doc(self, text):
-        return self.tokenizer(text)
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Thai"]

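Sketch of how the [nlp.tokenizer] block resolves to the Thai tokenizer (not part of the patch; building the tokenizer itself requires the pythainlp package and a real nlp object):

from spacy.util import registry
import spacy.lang.th  # noqa: F401  - importing the module runs the @registry.tokenizers decorator

create_tokenizer = registry.tokenizers.get("spacy.ThaiTokenizer.v1")
tokenizer_factory = create_tokenizer()
# tokenizer = tokenizer_factory(nlp)  # called later with the nlp object, returns a ThaiTokenizer
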
| 
Tagalog (tl) — language data:

@@ -1,31 +1,47 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
 
 
-def _return_tl(_):
-    return "tl"
+DEFAULT_CONFIG = """
+[nlp]
+lang = "tl"
+stop_words = {"@language_data": "spacy.tl.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.tl.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.tl.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class TagalogDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = _return_tl
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
 
 
 class Tagalog(Language):
     lang = "tl"
     Defaults = TagalogDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Tagalog"]
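Not part of the diff above — a minimal standalone sketch of the pattern it introduces: language data is registered under a string name, the config only stores that name, and the registry resolves it back to the callable. The sketch uses catalogue (the library behind spacy.util.registry) directly, so the registry namespace and the sample stop words below are illustrative stand-ins, not spaCy APIs or real data.

    # Illustrative sketch of the "registered language data + config" pattern.
    import catalogue
    from thinc.api import Config

    # Stand-in registry table; in the diff this role is played by
    # spacy.util.registry.language_data.
    language_data = catalogue.create("sketch", "language_data", entry_points=False)

    @language_data("sketch.tl.stop_words")
    def stop_words():
        return {"ang", "sa", "na"}  # stand-in values, not the real STOP_WORDS

    # The config stores only the string name; the registry resolves the callable.
    func = language_data.get("sketch.tl.stop_words")
    assert "ang" in func()

    # The DEFAULT_CONFIG string itself is plain text until parsed into a Config.
    config = Config().from_str('[nlp]\nlang = "tl"\n')
    assert config["nlp"]["lang"] == "tl"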
Turkish (tr) — language data:

@@ -1,26 +1,40 @@
+from typing import Set
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
 
 
+DEFAULT_CONFIG = """
+[nlp]
+lang = "tr"
+stop_words = {"@language_data": "spacy.tr.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.tr.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
 class TurkishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "tr"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
 
 
 class Turkish(Language):
     lang = "tr"
     Defaults = TurkishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Turkish"]
Tatar (tt) — language data:

@@ -1,28 +1,42 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...attrs import LANG
 from ...language import Language
-from ...util import update_exc
+from ...util import update_exc, registry
 
 
+DEFAULT_CONFIG = """
+[nlp]
+lang = "tt"
+stop_words = {"@language_data": "spacy.tt.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.tt.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.tt.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.tt.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
 class TatarDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "tt"
-
-    lex_attr_getters.update(LEX_ATTRS)
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     infixes = tuple(TOKENIZER_INFIXES)
-
-    stop_words = STOP_WORDS
 
 
 class Tatar(Language):
     lang = "tt"
     Defaults = TatarDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Tatar"]
Ukrainian (uk) — language data:

@@ -1,36 +1,49 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
 from ...language import Language
-from ...lookups import Lookups
-from ...attrs import LANG, NORM
 from .lemmatizer import UkrainianLemmatizer
 
 
-class UkrainianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "uk"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    lex_attr_getters.update(LEX_ATTRS)
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return UkrainianLemmatizer(lookups)
+DEFAULT_CONFIG = """
+[nlp]
+lang = "uk"
+stop_words = {"@language_data": "spacy.uk.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.uk.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.UkrainianLemmatizer.v1"
+"""
+
+
+@registry.language_data("spacy.uk.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.uk.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
+@registry.lemmatizers("spacy.UkrainianLemmatizer.v1")
+def create_ukrainian_lemmatizer() -> UkrainianLemmatizer:
+    return UkrainianLemmatizer()
+
+
+class UkrainianDefaults(Language.Defaults):
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 
 
 class Ukrainian(Language):
     lang = "uk"
     Defaults = UkrainianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Ukrainian"]
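Not part of the diff — a hedged sketch of how the lemmatizer entry registered above would be looked up. It assumes the WIP registry table `registry.lemmatizers` from this branch, and constructing the lemmatizer would additionally require pymorphy2 with the Ukrainian dictionaries, so the construction line is left commented out.

    # Hypothetical lookup of the factory registered as "spacy.UkrainianLemmatizer.v1".
    from spacy.util import registry  # WIP branch API, see the diff above

    make_lemmatizer = registry.lemmatizers.get("spacy.UkrainianLemmatizer.v1")
    # lemmatizer = make_lemmatizer()  # would import pymorphy2 on construction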
Ukrainian (uk) — lemmatizer:

@@ -1,11 +1,17 @@
+from typing import Optional, List, Tuple, Dict
+
 from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
+from ...lookups import Lookups
 from ...lemmatizer import Lemmatizer
 
 
+PUNCT_RULES = {"«": '"', "»": '"'}
+
+
 class UkrainianLemmatizer(Lemmatizer):
     _morph = None
 
-    def __init__(self, lookups=None):
+    def __init__(self, lookups: Optional[Lookups] = None) -> None:
         super(UkrainianLemmatizer, self).__init__(lookups)
         try:
             from pymorphy2 import MorphAnalyzer
@@ -19,15 +25,15 @@ class UkrainianLemmatizer(Lemmatizer):
                 '"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
             )
 
-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
         univ_pos = self.normalize_univ_pos(univ_pos)
         if univ_pos == "PUNCT":
             return [PUNCT_RULES.get(string, string)]
-
         if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
             # Skip unchangeable pos
             return [string.lower()]
-
         analyses = self._morph.parse(string)
         filtered_analyses = []
         for analysis in analyses:
@@ -39,12 +45,10 @@ class UkrainianLemmatizer(Lemmatizer):
                 analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
             ):
                 filtered_analyses.append(analysis)
-
         if not len(filtered_analyses):
             return [string.lower()]
         if morphology is None or (len(morphology) == 1 and POS in morphology):
             return list(set([analysis.normal_form for analysis in filtered_analyses]))
-
         if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
             features_to_compare = ["Case", "Number", "Gender"]
         elif univ_pos == "NUM":
@@ -61,7 +65,6 @@ class UkrainianLemmatizer(Lemmatizer):
                 "VerbForm",
                 "Voice",
             ]
-
         analyses, filtered_analyses = filtered_analyses, []
         for analysis in analyses:
             _, analysis_morph = oc2ud(str(analysis.tag))
@@ -74,16 +77,14 @@ class UkrainianLemmatizer(Lemmatizer):
                     break
             else:
                 filtered_analyses.append(analysis)
-
         if not len(filtered_analyses):
             return [string.lower()]
         return list(set([analysis.normal_form for analysis in filtered_analyses]))
 
     @staticmethod
-    def normalize_univ_pos(univ_pos):
+    def normalize_univ_pos(univ_pos: str) -> Optional[str]:
         if isinstance(univ_pos, str):
             return univ_pos.upper()
-
         symbols_to_str = {
             ADJ: "ADJ",
             DET: "DET",
@@ -98,14 +99,14 @@ class UkrainianLemmatizer(Lemmatizer):
             return symbols_to_str[univ_pos]
         return None
 
-    def lookup(self, string, orth=None):
+    def lookup(self, string: str, orth: Optional[int] = None) -> str:
         analyses = self._morph.parse(string)
         if len(analyses) == 1:
             return analyses[0].normal_form
         return string
 
 
-def oc2ud(oc_tag):
+def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
     gram_map = {
         "_POS": {
             "ADJF": "ADJ",
@@ -160,11 +161,9 @@ def oc2ud(oc_tag):
         "Voice": {"actv": "Act", "pssv": "Pass"},
         "Abbr": {"Abbr": "Yes"},
     }
-
     pos = "X"
     morphology = dict()
     unmatched = set()
-
     grams = oc_tag.replace(" ", ",").split(",")
     for gram in grams:
         match = False
@@ -177,7 +176,6 @@ def oc2ud(oc_tag):
                     morphology[categ] = gmap[gram]
         if not match:
             unmatched.add(gram)
-
     while len(unmatched) > 0:
         gram = unmatched.pop()
         if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
@@ -186,8 +184,4 @@ def oc2ud(oc_tag):
             pos = "AUX"
         elif gram == "Pltm":
             morphology["Number"] = "Ptan"
-
     return pos, morphology
-
-
-PUNCT_RULES = {"«": '"', "»": '"'}
Urdu (ur) — language data:

@@ -1,26 +1,53 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ur"
+stop_words = {"@language_data": "spacy.ur.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.ur.lex_attr_getters"}
+
+[nlp.writing_system]
+direction = "rtl"
+has_case = false
+has_letters = true
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.ur.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.ur.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class UrduDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "ur"
-
     tokenizer_exceptions = BASE_EXCEPTIONS
-    stop_words = STOP_WORDS
     suffixes = TOKENIZER_SUFFIXES
-    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
 
 
 class Urdu(Language):
     lang = "ur"
     Defaults = UrduDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Urdu"]
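Not part of the diff — a minimal sketch showing that the [nlp.writing_system] block, which replaces the old writing_system class attribute, parses into plain nested values. It only assumes thinc's Config, which the diff itself imports.

    from thinc.api import Config

    cfg_text = """
    [nlp]
    lang = "ur"

    [nlp.writing_system]
    direction = "rtl"
    has_case = false
    has_letters = true
    """
    config = Config().from_str(cfg_text)
    # Sections become nested dict-like values; JSON-style literals are parsed.
    assert config["nlp"]["writing_system"]["direction"] == "rtl"
    assert config["nlp"]["writing_system"]["has_case"] is False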
Vietnamese (vi) — language data and tokenizer:

@@ -1,38 +1,62 @@
-from ...attrs import LANG, NORM
-from ..norm_exceptions import BASE_NORMS
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from ...language import Language
 from ...tokens import Doc
 from .stop_words import STOP_WORDS
-from ...util import add_lookups
+from ...util import DummyTokenizer, registry
 from .lex_attrs import LEX_ATTRS
 
 
-class VietnameseDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "vi"  # for pickling
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    lex_attr_getters.update(LEX_ATTRS)
-    stop_words = STOP_WORDS
-    use_pyvi = True
-
-
-class Vietnamese(Language):
-    lang = "vi"
-    Defaults = VietnameseDefaults  # override defaults
-
-    def make_doc(self, text):
-        if self.Defaults.use_pyvi:
+DEFAULT_CONFIG = """
+[nlp]
+lang = "vi"
+stop_words = {"@language_data": "spacy.vi.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.vi.lex_attr_getters"}
+
+[nlp.tokenizer]
+@tokenizers = "spacy.VietnameseTokenizer.v1"
+use_pyvi = true
+"""
+
+
+@registry.language_data("spacy.vi.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.vi.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
+@registry.tokenizers("spacy.VietnameseTokenizer.v1")
+def create_vietnamese_tokenizer(use_pyvi: bool = True,):
+    def vietnamese_tokenizer_factory(nlp):
+        return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
+
+    return vietnamese_tokenizer_factory
+
+
+class VietnameseTokenizer(DummyTokenizer):
+    def __init__(self, nlp: Language, use_pyvi: bool = False):
+        self.vocab = nlp.vocab
+        self.use_pyvi = use_pyvi
+        if self.use_pyvi:
             try:
                 from pyvi import ViTokenizer
+
+                self.ViTokenizer = ViTokenizer
             except ImportError:
                 msg = (
-                    "Pyvi not installed. Either set Vietnamese.use_pyvi = False, "
+                    "Pyvi not installed. Either set use_pyvi = False, "
                     "or install it https://pypi.python.org/pypi/pyvi"
                 )
                 raise ImportError(msg)
-            words, spaces = ViTokenizer.spacy_tokenize(text)
+
+    def __call__(self, text: str) -> Doc:
+        if self.use_pyvi:
+            words, spaces = self.ViTokenizer.spacy_tokenize(text)
             return Doc(self.vocab, words=words, spaces=spaces)
         else:
             words = []
@@ -44,4 +68,9 @@ class Vietnamese(Language):
             return Doc(self.vocab, words=words, spaces=spaces)
 
 
+class Vietnamese(Language):
+    lang = "vi"
+    default_config = Config().from_str(DEFAULT_CONFIG)
+
+
 __all__ = ["Vietnamese"]
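Not part of the diff — a self-contained sketch of the two-stage factory pattern used for the tokenizer above: the registered function closes over settings taken from the config and returns a factory that the Language object later calls with itself. All names below are illustrative stand-ins, not spaCy APIs.

    def create_tokenizer(use_pyvi: bool = True):
        # Stage 1: capture config settings in a closure.
        def tokenizer_factory(nlp):
            # Stage 2: the nlp object arrives later and supplies shared resources
            # (in spaCy, the vocab); here we just return a tuple as a stand-in.
            return (nlp, use_pyvi)
        return tokenizer_factory

    factory = create_tokenizer(use_pyvi=False)
    assert factory("dummy-nlp") == ("dummy-nlp", False)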
Multi-language (xx) — language data:

@@ -1,17 +1,17 @@
+from thinc.api import Config
+
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "xx"
+"""
 
 
 class MultiLanguageDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "xx"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    tokenizer_exceptions = BASE_EXCEPTIONS
 
 
 class MultiLanguage(Language):
@@ -21,6 +21,7 @@ class MultiLanguage(Language):
 
     lang = "xx"
     Defaults = MultiLanguageDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["MultiLanguage"]
Yoruba (yo) — language data:

@@ -1,21 +1,39 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "si"
+stop_words = {"@language_data": "spacy.yo.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.yo.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.yo.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.yo.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class YorubaDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "yo"
-    stop_words = STOP_WORDS
     tokenizer_exceptions = BASE_EXCEPTIONS
 
 
 class Yoruba(Language):
     lang = "yo"
     Defaults = YorubaDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Yoruba"]
| 
						 | 
					@ -1,13 +1,15 @@
 | 
				
			||||||
 | 
					from typing import Optional, List, Set, Dict, Callable, Any
 | 
				
			||||||
 | 
					from enum import Enum
 | 
				
			||||||
import tempfile
 | 
					import tempfile
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
import warnings
 | 
					import warnings
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from collections import OrderedDict
 | 
					from thinc.api import Config
 | 
				
			||||||
from ...attrs import LANG
 | 
					
 | 
				
			||||||
from ...errors import Warnings, Errors
 | 
					from ...errors import Warnings, Errors
 | 
				
			||||||
from ...language import Language
 | 
					from ...language import Language
 | 
				
			||||||
from ...tokens import Doc
 | 
					from ...tokens import Doc
 | 
				
			||||||
from ...util import DummyTokenizer
 | 
					from ...util import DummyTokenizer, registry
 | 
				
			||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
					from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
				
			||||||
from .lex_attrs import LEX_ATTRS
 | 
					from .lex_attrs import LEX_ATTRS
 | 
				
			||||||
from .stop_words import STOP_WORDS
 | 
					from .stop_words import STOP_WORDS
 | 
				
			||||||
| 
						 | 
					@ -16,88 +18,103 @@ from ... import util
 | 
				
			||||||
 | 
					
 | 
				
			||||||
_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"
 | 
					_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					DEFAULT_CONFIG = """
 | 
				
			||||||
 | 
					[nlp]
 | 
				
			||||||
 | 
					lang = "zh"
 | 
				
			||||||
 | 
					stop_words = {"@language_data": "spacy.zh.stop_words"}
 | 
				
			||||||
 | 
					lex_attr_getters = {"@language_data": "spacy.zh.lex_attr_getters"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def try_jieba_import(segmenter):
 | 
					[nlp.tokenizer]
 | 
				
			||||||
    try:
 | 
					@tokenizers = "spacy.ChineseTokenizer.v1"
 | 
				
			||||||
        import jieba
 | 
					segmenter = "char"
 | 
				
			||||||
 | 
					pkuseg_model = null
 | 
				
			||||||
 | 
					pkuseg_user_dict = "default"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if segmenter == "jieba":
 | 
					[nlp.writing_system]
 | 
				
			||||||
            # segment a short text to have jieba initialize its cache in advance
 | 
					direction = "ltr"
 | 
				
			||||||
            list(jieba.cut("作为", cut_all=False))
 | 
					has_case = false
 | 
				
			||||||
 | 
					has_letters = false
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        return jieba
 | 
					
 | 
				
			||||||
    except ImportError:
 | 
					class Segmenter(str, Enum):
 | 
				
			||||||
        if segmenter == "jieba":
 | 
					    char = "char"
 | 
				
			||||||
            msg = (
 | 
					    jieba = "jieba"
 | 
				
			||||||
                "Jieba not installed. To use jieba, install it with `pip "
 | 
					    pkuseg = "pkuseg"
 | 
				
			||||||
                " install jieba` or from https://github.com/fxsjy/jieba"
 | 
					
 | 
				
			||||||
 | 
					    @classmethod
 | 
				
			||||||
 | 
					    def values(cls):
 | 
				
			||||||
 | 
					        return list(cls.__members__.keys())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@registry.language_data("spacy.zh.stop_words")
 | 
				
			||||||
 | 
					def stop_words() -> Set[str]:
 | 
				
			||||||
 | 
					    return STOP_WORDS
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@registry.language_data("spacy.zh.lex_attr_getters")
 | 
				
			||||||
 | 
					def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
				
			||||||
 | 
					    return LEX_ATTRS
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@registry.tokenizers("spacy.ChineseTokenizer.v1")
 | 
				
			||||||
 | 
					def create_chinese_tokenizer(
 | 
				
			||||||
 | 
					    segmenter: Segmenter = Segmenter.char,
 | 
				
			||||||
 | 
					    pkuseg_model: Optional[str] = None,
 | 
				
			||||||
 | 
					    pkuseg_user_dict: Optional[str] = "default",
 | 
				
			||||||
 | 
					):
 | 
				
			||||||
 | 
					    def chinese_tokenizer_factory(nlp):
 | 
				
			||||||
 | 
					        return ChineseTokenizer(
 | 
				
			||||||
 | 
					            nlp,
 | 
				
			||||||
 | 
					            segmenter=segmenter,
 | 
				
			||||||
 | 
					            pkuseg_model=pkuseg_model,
 | 
				
			||||||
 | 
					            pkuseg_user_dict=pkuseg_user_dict,
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
            raise ImportError(msg)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return chinese_tokenizer_factory
 | 
				
			||||||
def try_pkuseg_import(segmenter, pkuseg_model, pkuseg_user_dict):
 | 
					 | 
				
			||||||
    try:
 | 
					 | 
				
			||||||
        import pkuseg
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        if pkuseg_model:
 | 
					 | 
				
			||||||
            return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
 | 
					 | 
				
			||||||
        elif segmenter == "pkuseg":
 | 
					 | 
				
			||||||
            msg = (
 | 
					 | 
				
			||||||
                "The Chinese word segmenter is 'pkuseg' but no pkuseg model "
 | 
					 | 
				
			||||||
                "was specified. Please provide the name of a pretrained model "
 | 
					 | 
				
			||||||
                "or the path to a model with "
 | 
					 | 
				
			||||||
                '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
 | 
					 | 
				
			||||||
                'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`'
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
            raise ValueError(msg)
 | 
					 | 
				
			||||||
    except ImportError:
 | 
					 | 
				
			||||||
        if segmenter == "pkuseg":
 | 
					 | 
				
			||||||
            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
 | 
					 | 
				
			||||||
            raise ImportError(msg)
 | 
					 | 
				
			||||||
    except FileNotFoundError:
 | 
					 | 
				
			||||||
        if segmenter == "pkuseg":
 | 
					 | 
				
			||||||
            msg = "Unable to load pkuseg model from: " + pkuseg_model
 | 
					 | 
				
			||||||
            raise FileNotFoundError(msg)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class ChineseTokenizer(DummyTokenizer):
 | 
					class ChineseTokenizer(DummyTokenizer):
 | 
				
			||||||
    def __init__(self, cls, nlp=None, config={}):
 | 
					    def __init__(
 | 
				
			||||||
        self.supported_segmenters = ("char", "jieba", "pkuseg")
 | 
					        self,
 | 
				
			||||||
        self.configure_segmenter(config)
 | 
					        nlp: Language,
 | 
				
			||||||
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
 | 
					        segmenter: Segmenter = Segmenter.char,
 | 
				
			||||||
        # remove relevant settings from config so they're not also saved in
 | 
					        pkuseg_model: Optional[str] = None,
 | 
				
			||||||
        # Language.meta
 | 
					        pkuseg_user_dict: Optional[str] = None,
 | 
				
			||||||
        for key in ["segmenter", "pkuseg_model", "pkuseg_user_dict"]:
 | 
					    ):
 | 
				
			||||||
            if key in config:
 | 
					        self.vocab = nlp.vocab
 | 
				
			||||||
                del config[key]
 | 
					        if isinstance(segmenter, Segmenter):  # we might have the Enum here
 | 
				
			||||||
        self.tokenizer = Language.Defaults().create_tokenizer(nlp)
 | 
					            segmenter = segmenter.value
 | 
				
			||||||
 | 
					        self.segmenter = segmenter
 | 
				
			||||||
 | 
					        self.pkuseg_model = pkuseg_model
 | 
				
			||||||
 | 
					        self.pkuseg_user_dict = pkuseg_user_dict
 | 
				
			||||||
 | 
					        self.pkuseg_seg = None
 | 
				
			||||||
 | 
					        self.jieba_seg = None
 | 
				
			||||||
 | 
					        self.configure_segmenter(segmenter)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def configure_segmenter(self, config):
 | 
					    def configure_segmenter(self, segmenter: str):
 | 
				
			||||||
        self.segmenter = "char"
 | 
					        if segmenter not in Segmenter.values():
 | 
				
			||||||
        if "segmenter" in config:
 | 
					 | 
				
			||||||
            if config["segmenter"] in self.supported_segmenters:
 | 
					 | 
				
			||||||
                self.segmenter = config["segmenter"]
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
            warn_msg = Warnings.W103.format(
 | 
					            warn_msg = Warnings.W103.format(
 | 
				
			||||||
                lang="Chinese",
 | 
					                lang="Chinese",
 | 
				
			||||||
                    segmenter=config["segmenter"],
 | 
					                segmenter=segmenter,
 | 
				
			||||||
                    supported=", ".join([repr(s) for s in self.supported_segmenters]),
 | 
					                supported=", ".join(Segmenter.values()),
 | 
				
			||||||
                default="'char' (character segmentation)",
 | 
					                default="'char' (character segmentation)",
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
            warnings.warn(warn_msg)
 | 
					            warnings.warn(warn_msg)
 | 
				
			||||||
 | 
					            self.segmenter = Segmenter.char
 | 
				
			||||||
        self.jieba_seg = try_jieba_import(self.segmenter)
 | 
					        self.jieba_seg = try_jieba_import(self.segmenter)
 | 
				
			||||||
        self.pkuseg_seg = try_pkuseg_import(
 | 
					        self.pkuseg_seg = try_pkuseg_import(
 | 
				
			||||||
            self.segmenter,
 | 
					            self.segmenter,
 | 
				
			||||||
            pkuseg_model=config.get("pkuseg_model", None),
 | 
					            pkuseg_model=self.pkuseg_model,
 | 
				
			||||||
            pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
 | 
					            pkuseg_user_dict=self.pkuseg_user_dict,
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __call__(self, text):
 | 
					    def __call__(self, text: str) -> Doc:
 | 
				
			||||||
        if self.segmenter == "jieba":
 | 
					        if self.segmenter == Segmenter.jieba:
 | 
				
			||||||
            words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
 | 
					            words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
 | 
				
			||||||
            (words, spaces) = util.get_words_and_spaces(words, text)
 | 
					            (words, spaces) = util.get_words_and_spaces(words, text)
 | 
				
			||||||
            return Doc(self.vocab, words=words, spaces=spaces)
 | 
					            return Doc(self.vocab, words=words, spaces=spaces)
 | 
				
			||||||
        elif self.segmenter == "pkuseg":
 | 
					        elif self.segmenter == Segmenter.pkuseg:
 | 
				
			||||||
            if self.pkuseg_seg is None:
 | 
					            if self.pkuseg_seg is None:
 | 
				
			||||||
                raise ValueError(Errors.E1000)
 | 
					                raise ValueError(Errors.E1000)
 | 
				
			||||||
            words = self.pkuseg_seg.cut(text)
 | 
					            words = self.pkuseg_seg.cut(text)
 | 
				
			||||||
| 
						 | 
					@ -105,11 +122,11 @@ class ChineseTokenizer(DummyTokenizer):
 | 
				
			||||||
            return Doc(self.vocab, words=words, spaces=spaces)
 | 
					            return Doc(self.vocab, words=words, spaces=spaces)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # warn if segmenter setting is not the only remaining option "char"
 | 
					        # warn if segmenter setting is not the only remaining option "char"
 | 
				
			||||||
        if self.segmenter != "char":
 | 
					        if self.segmenter != Segmenter.char:
 | 
				
			||||||
            warn_msg = Warnings.W103.format(
 | 
					            warn_msg = Warnings.W103.format(
 | 
				
			||||||
                lang="Chinese",
 | 
					                lang="Chinese",
 | 
				
			||||||
                segmenter=self.segmenter,
 | 
					                segmenter=self.segmenter,
 | 
				
			||||||
                supported=", ".join([repr(s) for s in self.supported_segmenters]),
 | 
					                supported=", ".join(Segmenter.values()),
 | 
				
			||||||
                default="'char' (character segmentation)",
 | 
					                default="'char' (character segmentation)",
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
            warnings.warn(warn_msg)
 | 
					            warnings.warn(warn_msg)
 | 
				
			||||||
| 
						 | 
					@ -119,15 +136,14 @@ class ChineseTokenizer(DummyTokenizer):
 | 
				
			||||||
        (words, spaces) = util.get_words_and_spaces(words, text)
 | 
					        (words, spaces) = util.get_words_and_spaces(words, text)
 | 
				
			||||||
        return Doc(self.vocab, words=words, spaces=spaces)
 | 
					        return Doc(self.vocab, words=words, spaces=spaces)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def pkuseg_update_user_dict(self, words, reset=False):
 | 
					    def pkuseg_update_user_dict(self, words: List[str], reset: bool = False):
 | 
				
			||||||
        if self.segmenter == "pkuseg":
 | 
					        if self.segmenter == Segmenter.pkuseg:
 | 
				
			||||||
            if reset:
 | 
					            if reset:
 | 
				
			||||||
                try:
 | 
					                try:
 | 
				
			||||||
                    import pkuseg
 | 
					                    import pkuseg
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                    self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
 | 
					                    self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
 | 
				
			||||||
                except ImportError:
 | 
					                except ImportError:
 | 
				
			||||||
                    if self.segmenter == "pkuseg":
 | 
					 | 
				
			||||||
                    msg = (
 | 
					                    msg = (
 | 
				
			||||||
                        "pkuseg not installed: unable to reset pkuseg "
 | 
					                        "pkuseg not installed: unable to reset pkuseg "
 | 
				
			||||||
                        "user dict. Please " + _PKUSEG_INSTALL_MSG
 | 
					                        "user dict. Please " + _PKUSEG_INSTALL_MSG
 | 
				
			||||||
| 
						 | 
					@ -139,13 +155,6 @@ class ChineseTokenizer(DummyTokenizer):
 | 
				
			||||||
            warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
 | 
					            warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
 | 
				
			||||||
            warnings.warn(warn_msg)
 | 
					            warnings.warn(warn_msg)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _get_config(self):
 | 
					 | 
				
			||||||
        config = OrderedDict((("segmenter", self.segmenter),))
 | 
					 | 
				
			||||||
        return config
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def _set_config(self, config={}):
 | 
					 | 
				
			||||||
        self.configure_segmenter(config)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def to_bytes(self, **kwargs):
 | 
					    def to_bytes(self, **kwargs):
 | 
				
			||||||
        pkuseg_features_b = b""
 | 
					        pkuseg_features_b = b""
 | 
				
			||||||
        pkuseg_weights_b = b""
 | 
					        pkuseg_weights_b = b""
 | 
				
			||||||
| 
						 | 
					@ -165,17 +174,11 @@ class ChineseTokenizer(DummyTokenizer):
 | 
				
			||||||
                sorted(list(self.pkuseg_seg.postprocesser.common_words)),
 | 
					                sorted(list(self.pkuseg_seg.postprocesser.common_words)),
 | 
				
			||||||
                sorted(list(self.pkuseg_seg.postprocesser.other_words)),
 | 
					                sorted(list(self.pkuseg_seg.postprocesser.other_words)),
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        serializers = OrderedDict(
 | 
					        serializers = {
 | 
				
			||||||
            (
 | 
					            "pkuseg_features": lambda: pkuseg_features_b,
 | 
				
			||||||
                ("cfg", lambda: srsly.json_dumps(self._get_config())),
 | 
					            "pkuseg_weights": lambda: pkuseg_weights_b,
 | 
				
			||||||
                ("pkuseg_features", lambda: pkuseg_features_b),
 | 
					            "pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data),
 | 
				
			||||||
                ("pkuseg_weights", lambda: pkuseg_weights_b),
 | 
					        }
 | 
				
			||||||
                (
 | 
					 | 
				
			||||||
                    "pkuseg_processors",
 | 
					 | 
				
			||||||
                    lambda: srsly.msgpack_dumps(pkuseg_processors_data),
 | 
					 | 
				
			||||||
                ),
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        return util.to_bytes(serializers, [])
 | 
					        return util.to_bytes(serializers, [])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def from_bytes(self, data, **kwargs):
 | 
					    def from_bytes(self, data, **kwargs):
 | 
				
			||||||
| 
						 | 
					@ -190,14 +193,11 @@ class ChineseTokenizer(DummyTokenizer):
 | 
				
			||||||
        def deserialize_pkuseg_processors(b):
 | 
					        def deserialize_pkuseg_processors(b):
 | 
				
			||||||
            pkuseg_data["processors_data"] = srsly.msgpack_loads(b)
 | 
					            pkuseg_data["processors_data"] = srsly.msgpack_loads(b)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        deserializers = OrderedDict(
 | 
					        deserializers = {
 | 
				
			||||||
            (
 | 
					            "pkuseg_features": deserialize_pkuseg_features,
 | 
				
			||||||
                ("cfg", lambda b: self._set_config(srsly.json_loads(b))),
 | 
					            "pkuseg_weights": deserialize_pkuseg_weights,
 | 
				
			||||||
                ("pkuseg_features", deserialize_pkuseg_features),
 | 
					            "pkuseg_processors": deserialize_pkuseg_processors,
 | 
				
			||||||
                ("pkuseg_weights", deserialize_pkuseg_weights),
 | 
					        }
 | 
				
			||||||
                ("pkuseg_processors", deserialize_pkuseg_processors),
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        util.from_bytes(data, deserializers, [])
 | 
					        util.from_bytes(data, deserializers, [])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
 | 
					        if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
 | 
				
			||||||
| 
						 | 
					@ -245,13 +245,10 @@ class ChineseTokenizer(DummyTokenizer):
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
                srsly.write_msgpack(path, data)
 | 
					                srsly.write_msgpack(path, data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        serializers = OrderedDict(
 | 
					        serializers = {
 | 
				
			||||||
            (
 | 
					            "pkuseg_model": lambda p: save_pkuseg_model(p),
 | 
				
			||||||
                ("cfg", lambda p: srsly.write_json(p, self._get_config())),
 | 
					            "pkuseg_processors": lambda p: save_pkuseg_processors(p),
 | 
				
			||||||
                ("pkuseg_model", lambda p: save_pkuseg_model(p)),
 | 
					        }
 | 
				
			||||||
                ("pkuseg_processors", lambda p: save_pkuseg_processors(p)),
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        return util.to_disk(path, serializers, [])
 | 
					        return util.to_disk(path, serializers, [])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def from_disk(self, path, **kwargs):
 | 
					    def from_disk(self, path, **kwargs):
 | 
				
			||||||
| 
						 | 
@@ -261,7 +258,7 @@ class ChineseTokenizer(DummyTokenizer):
             try:
                 import pkuseg
             except ImportError:
-                if self.segmenter == "pkuseg":
+                if self.segmenter == Segmenter.pkuseg:
                     raise ImportError(
                         "pkuseg not installed. To use this model, "
                         + _PKUSEG_INSTALL_MSG

@@ -273,9 +270,9 @@ class ChineseTokenizer(DummyTokenizer):
             try:
                 import pkuseg
             except ImportError:
-                if self.segmenter == "pkuseg":
+                if self.segmenter == Segmenter.pkuseg:
                     raise ImportError(self._pkuseg_install_msg)
-            if self.segmenter == "pkuseg":
+            if self.segmenter == Segmenter.pkuseg:
                 data = srsly.read_msgpack(path)
                 (user_dict, do_process, common_words, other_words) = data
                 self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
@@ -283,35 +280,64 @@ class ChineseTokenizer(DummyTokenizer):
                 self.pkuseg_seg.postprocesser.common_words = set(common_words)
                 self.pkuseg_seg.postprocesser.other_words = set(other_words)

-        serializers = OrderedDict(
-            (
-                ("cfg", lambda p: self._set_config(srsly.read_json(p))),
-                ("pkuseg_model", lambda p: load_pkuseg_model(p)),
-                ("pkuseg_processors", lambda p: load_pkuseg_processors(p)),
-            )
-        )
+        serializers = {
+            "pkuseg_model": lambda p: load_pkuseg_model(p),
+            "pkuseg_processors": lambda p: load_pkuseg_processors(p),
+        }
         util.from_disk(path, serializers, [])


 class ChineseDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "zh"
     tokenizer_exceptions = BASE_EXCEPTIONS
-    stop_words = STOP_WORDS
-    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
-
-    @classmethod
-    def create_tokenizer(cls, nlp=None, config={}):
-        return ChineseTokenizer(cls, nlp, config=config)


 class Chinese(Language):
     lang = "zh"
-    Defaults = ChineseDefaults  # override defaults
+    Defaults = ChineseDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)

-    def make_doc(self, text):
-        return self.tokenizer(text)
+
+def try_jieba_import(segmenter: str) -> None:
+    try:
+        import jieba
+
+        if segmenter == Segmenter.jieba:
+            # segment a short text to have jieba initialize its cache in advance
+            list(jieba.cut("作为", cut_all=False))
+
+        return jieba
+    except ImportError:
+        if segmenter == Segmenter.jieba:
+            msg = (
+                "Jieba not installed. To use jieba, install it with `pip "
+                " install jieba` or from https://github.com/fxsjy/jieba"
+            )
+            raise ImportError(msg)
+
+
+def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) -> None:
+    try:
+        import pkuseg
+
+        if pkuseg_model:
+            return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
+        elif segmenter == Segmenter.pkuseg:
+            msg = (
+                "The Chinese word segmenter is 'pkuseg' but no pkuseg model "
+                "was specified. Please provide the name of a pretrained model "
+                "or the path to a model with:\n"
+                'cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path }}\n'
+                "nlp = Chinese.from_config(cfg)"
+            )
+            raise ValueError(msg)
+    except ImportError:
+        if segmenter == Segmenter.pkuseg:
+            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
+            raise ImportError(msg)
+    except FileNotFoundError:
+        if segmenter == Segmenter.pkuseg:
+            msg = "Unable to load pkuseg model from: " + pkuseg_model
+            raise FileNotFoundError(msg)


 def _get_pkuseg_trie_data(node, path=""):
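For reference, the configuration pattern spelled out in the ValueError message above looks like this in practice. "mixed" is a placeholder for a pretrained pkuseg model name or a local path, and the call simply mirrors the message rather than documenting a finalized API:

    from spacy.lang.zh import Chinese

    cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": "mixed"}}}
    nlp = Chinese.from_config(cfg)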
							
								
								
									
spacy/language.py: diff suppressed because it is too large (1046 lines changed).
@@ -1,5 +1,14 @@
+from typing import Optional, Callable, List, Dict
+
+from .lookups import Lookups
 from .errors import Errors
 from .parts_of_speech import NAMES as UPOS_NAMES
+from .util import registry, load_language_data, SimpleFrozenDict
+
+
+@registry.lemmatizers("spacy.Lemmatizer.v1")
+def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
+    return Lemmatizer(data_paths=data_paths)


 class Lemmatizer:

@@ -14,17 +23,27 @@ class Lemmatizer:
     def load(cls, *args, **kwargs):
         raise NotImplementedError(Errors.E172)

-    def __init__(self, lookups, is_base_form=None):
+    def __init__(
+        self,
+        lookups: Optional[Lookups] = None,
+        data_paths: dict = SimpleFrozenDict(),
+        is_base_form: Optional[Callable] = None,
+    ) -> None:
         """Initialize a Lemmatizer.

         lookups (Lookups): The lookups object containing the (optional) tables
             "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
         RETURNS (Lemmatizer): The newly constructed object.
         """
-        self.lookups = lookups
+        self.lookups = lookups if lookups is not None else Lookups()
+        for name, filename in data_paths.items():
+            data = load_language_data(filename)
+            self.lookups.add_table(name, data)
         self.is_base_form = is_base_form

-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
         """Lemmatize a string.

         string (str): The string to lemmatize, e.g. the token text.

@@ -39,7 +58,6 @@ class Lemmatizer:
         if isinstance(univ_pos, int):
             univ_pos = UPOS_NAMES.get(univ_pos, "X")
         univ_pos = univ_pos.lower()
-
         if univ_pos in ("", "eol", "space"):
             return [string.lower()]
         # See Issue #435 for example of where this logic is requied.

@@ -67,65 +85,31 @@ class Lemmatizer:
         )
         return lemmas

-    def is_base_form(self, univ_pos, morphology=None):
-        """
-        Check whether we're dealing with an uninflected paradigm, so we can
-        avoid lemmatization entirely.
-
-        univ_pos (str / int): The token's universal part-of-speech tag.
-        morphology (dict): The token's morphological features following the
-            Universal Dependencies scheme.
-        """
-        if morphology is None:
-            morphology = {}
-        if univ_pos == "noun" and morphology.get("Number") == "sing":
-            return True
-        elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
-            return True
-        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
-        # morphology
-        elif univ_pos == "verb" and (
-            morphology.get("VerbForm") == "fin"
-            and morphology.get("Tense") == "pres"
-            and morphology.get("Number") is None
-        ):
-            return True
-        elif univ_pos == "adj" and morphology.get("Degree") == "pos":
-            return True
-        elif morphology.get("VerbForm") == "inf":
-            return True
-        elif morphology.get("VerbForm") == "none":
-            return True
-        elif morphology.get("Degree") == "pos":
-            return True
-        else:
-            return False
-
-    def noun(self, string, morphology=None):
+    def noun(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "noun", morphology)

-    def verb(self, string, morphology=None):
+    def verb(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "verb", morphology)

-    def adj(self, string, morphology=None):
+    def adj(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "adj", morphology)

-    def det(self, string, morphology=None):
+    def det(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "det", morphology)

-    def pron(self, string, morphology=None):
+    def pron(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "pron", morphology)

-    def adp(self, string, morphology=None):
+    def adp(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "adp", morphology)

-    def num(self, string, morphology=None):
+    def num(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "num", morphology)

-    def punct(self, string, morphology=None):
+    def punct(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "punct", morphology)

-    def lookup(self, string, orth=None):
+    def lookup(self, string: str, orth: Optional[int] = None) -> str:
         """Look up a lemma in the table, if available. If no lemma is found,
         the original string is returned.

@@ -141,7 +125,13 @@ class Lemmatizer:
             return lookup_table[key]
         return string

-    def lemmatize(self, string, index, exceptions, rules):
+    def lemmatize(
+        self,
+        string: str,
+        index: Dict[str, List[str]],
+        exceptions: Dict[str, Dict[str, List[str]]],
+        rules: Dict[str, List[List[str]]],
+    ) -> List[str]:
         orig = string
         string = string.lower()
         forms = []
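A quick sketch of the refactored constructor: lookups are now optional and can be populated either directly or via data_paths, which loads each file with load_language_data. The table contents below are toy data, and the import path assumes the module still lives at spacy.lemmatizer:

    from spacy.lemmatizer import Lemmatizer   # assumed module path
    from spacy.lookups import Lookups

    lookups = Lookups()
    lookups.add_table("lemma_lookup", {"dogs": "dog"})   # illustrative data only
    lemmatizer = Lemmatizer(lookups=lookups)
    assert lemmatizer.lookup("dogs") == "dog"

    # The registered "spacy.Lemmatizer.v1" factory builds the same object from
    # file paths instead, by passing data_paths={table_name: filename, ...}.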
@@ -1,15 +1,32 @@
+from typing import Dict, Any, List, Union, Optional
+from pathlib import Path
 import srsly
 from preshed.bloom import BloomFilter
 from collections import OrderedDict

 from .errors import Errors
-from .util import SimpleFrozenDict, ensure_path
+from .util import SimpleFrozenDict, ensure_path, registry
 from .strings import get_string_id


 UNSET = object()


+@registry.language_data("spacy-lookups-data")
+def get_lookups(lang: str) -> Dict[str, Any]:
+    """Load the data from the spacy-lookups-data package for a given language,
+    if available. Returns an empty dict if there's no data or if the package
+    is not installed.
+
+    lang (str): The language code (corresponds to entry point exposed by
+        the spacy-lookups-data package).
+    RETURNS (Dict[str, Any]): The lookups, keyed by table name.
+    """
+    if lang in registry.lookups:
+        return registry.lookups.get(lang)
+    return {}
+
+
 class Lookups:
     """Container for large lookup tables and dictionaries, e.g. lemmatization
     data or tokenizer exception lists. Lookups are available via vocab.lookups,

@@ -18,7 +35,7 @@ class Lookups:
     via doc.vocab.lookups.
     """

-    def __init__(self):
+    def __init__(self) -> None:
         """Initialize the Lookups object.

         RETURNS (Lookups): The newly created object.

@@ -27,7 +44,7 @@ class Lookups:
         """
         self._tables = {}

-    def __contains__(self, name):
+    def __contains__(self, name: str) -> bool:
         """Check if the lookups contain a table of a given name. Delegates to
         Lookups.has_table.

@@ -36,16 +53,16 @@ class Lookups:
         """
         return self.has_table(name)

-    def __len__(self):
+    def __len__(self) -> int:
         """RETURNS (int): The number of tables in the lookups."""
         return len(self._tables)

     @property
-    def tables(self):
-        """RETURNS (list): Names of all tables in the lookups."""
+    def tables(self) -> List[str]:
+        """RETURNS (List[str]): Names of all tables in the lookups."""
         return list(self._tables.keys())

-    def add_table(self, name, data=SimpleFrozenDict()):
+    def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> "Table":
         """Add a new table to the lookups. Raises an error if the table exists.

         name (str): Unique name of table.

@@ -60,12 +77,12 @@ class Lookups:
         self._tables[name] = table
         return table

-    def get_table(self, name, default=UNSET):
+    def get_table(self, name: str, default: Any = UNSET) -> "Table":
         """Get a table. Raises an error if the table doesn't exist and no
         default value is provided.

         name (str): Name of the table.
-        default: Optional default value to return if table doesn't exist.
+        default (Any): Optional default value to return if table doesn't exist.
         RETURNS (Table): The table.

         DOCS: https://spacy.io/api/lookups#get_table

@@ -76,7 +93,7 @@ class Lookups:
             return default
         return self._tables[name]

-    def remove_table(self, name):
+    def remove_table(self, name: str) -> "Table":
         """Remove a table. Raises an error if the table doesn't exist.

         name (str): Name of the table to remove.

@@ -88,7 +105,7 @@ class Lookups:
             raise KeyError(Errors.E159.format(name=name, tables=self.tables))
         return self._tables.pop(name)

-    def has_table(self, name):
+    def has_table(self, name: str) -> bool:
         """Check if the lookups contain a table of a given name.

         name (str): Name of the table.

@@ -98,7 +115,7 @@ class Lookups:
         """
         return name in self._tables

-    def to_bytes(self, **kwargs):
+    def to_bytes(self, **kwargs) -> bytes:
         """Serialize the lookups to a bytestring.

         RETURNS (bytes): The serialized Lookups.

@@ -107,7 +124,7 @@ class Lookups:
         """
         return srsly.msgpack_dumps(self._tables)

-    def from_bytes(self, bytes_data, **kwargs):
+    def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
         """Load the lookups from a bytestring.

         bytes_data (bytes): The data to load.

@@ -120,7 +137,9 @@ class Lookups:
             self._tables[key] = Table(key, value)
         return self

-    def to_disk(self, path, filename="lookups.bin", **kwargs):
+    def to_disk(
+        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
+    ) -> None:
         """Save the lookups to a directory as lookups.bin. Expects a path to a
         directory, which will be created if it doesn't exist.

@@ -136,7 +155,9 @@ class Lookups:
             with filepath.open("wb") as file_:
                 file_.write(self.to_bytes())

-    def from_disk(self, path, filename="lookups.bin", **kwargs):
+    def from_disk(
+        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
+    ) -> "Lookups":
         """Load lookups from a directory containing a lookups.bin. Will skip
         loading if the file doesn't exist.

@@ -162,7 +183,7 @@ class Table(OrderedDict):
     """

     @classmethod
-    def from_dict(cls, data, name=None):
+    def from_dict(cls, data: dict, name: Optional[str] = None) -> "Table":
         """Initialize a new table from a dict.

         data (dict): The dictionary.

@@ -175,7 +196,7 @@ class Table(OrderedDict):
         self.update(data)
         return self

-    def __init__(self, name=None, data=None):
+    def __init__(self, name: Optional[str] = None, data: Optional[dict] = None) -> None:
         """Initialize a new table.

         name (str): Optional table name for reference.

@@ -193,7 +214,7 @@ class Table(OrderedDict):
         if data:
             self.update(data)

-    def __setitem__(self, key, value):
+    def __setitem__(self, key: Union[str, int], value: Any) -> None:
         """Set new key/value pair. String keys will be hashed.

         key (str / int): The key to set.

@@ -203,7 +224,7 @@ class Table(OrderedDict):
         OrderedDict.__setitem__(self, key, value)
         self.bloom.add(key)

-    def set(self, key, value):
+    def set(self, key: Union[str, int], value: Any) -> None:
         """Set new key/value pair. String keys will be hashed.
         Same as table[key] = value.

@@ -212,7 +233,7 @@ class Table(OrderedDict):
         """
         self[key] = value

-    def __getitem__(self, key):
+    def __getitem__(self, key: Union[str, int]) -> Any:
         """Get the value for a given key. String keys will be hashed.

         key (str / int): The key to get.

@@ -221,7 +242,7 @@ class Table(OrderedDict):
         key = get_string_id(key)
         return OrderedDict.__getitem__(self, key)

-    def get(self, key, default=None):
+    def get(self, key: Union[str, int], default: Optional[Any] = None) -> Any:
         """Get the value for a given key. String keys will be hashed.

         key (str / int): The key to get.

@@ -231,7 +252,7 @@ class Table(OrderedDict):
         key = get_string_id(key)
         return OrderedDict.get(self, key, default)

-    def __contains__(self, key):
+    def __contains__(self, key: Union[str, int]) -> bool:
         """Check whether a key is in the table. String keys will be hashed.

         key (str / int): The key to check.

@@ -243,7 +264,7 @@ class Table(OrderedDict):
             return False
         return OrderedDict.__contains__(self, key)

-    def to_bytes(self):
+    def to_bytes(self) -> bytes:
         """Serialize table to a bytestring.

         RETURNS (bytes): The serialized table.

@@ -257,7 +278,7 @@ class Table(OrderedDict):
         }
         return srsly.msgpack_dumps(data)

-    def from_bytes(self, bytes_data):
+    def from_bytes(self, bytes_data: bytes) -> "Table":
         """Load a table from a bytestring.

         bytes_data (bytes): The data to load.
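The type hints above don't change behaviour: Table still hashes string keys through get_string_id, and serialization still round-trips via msgpack. A small usage sketch with throwaway data:

    from spacy.lookups import Lookups

    lookups = Lookups()
    table = lookups.add_table("lemma_lookup", {"dogs": "dog"})  # toy table
    assert "lemma_lookup" in lookups       # __contains__ delegates to has_table
    assert table["dogs"] == "dog"          # string keys are hashed internally

    restored = Lookups().from_bytes(lookups.to_bytes())
    assert restored.get_table("lemma_lookup")["dogs"] == "dog"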
@@ -26,8 +26,8 @@ def build_nel_encoder(tok2vec, nO=None):


 @registry.assets.register("spacy.KBFromFile.v1")
-def load_kb(nlp_path, kb_path) -> KnowledgeBase:
-    vocab = Vocab().from_disk(Path(nlp_path) / "vocab")
+def load_kb(vocab_path, kb_path) -> KnowledgeBase:
+    vocab = Vocab().from_disk(vocab_path)
     kb = KnowledgeBase(vocab=vocab)
     kb.load_bulk(kb_path)
     return kb
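The registered asset now takes a path to a serialized vocab directly instead of deriving it from a pipeline directory. A hedged sketch of calling it through the registry, assuming the assets registry is exposed on spacy.util.registry as the decorator above suggests; both paths are placeholders:

    from spacy.util import registry

    load_kb = registry.assets.get("spacy.KBFromFile.v1")
    kb = load_kb("/path/to/vocab", "/path/to/kb")   # hypothetical paths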
@@ -1,30 +1,9 @@
-from thinc.api import (
-    Model,
-    reduce_mean,
-    Linear,
-    list2ragged,
-    Logistic,
-    ParametricAttention,
-)
-from thinc.api import chain, concatenate, clone, Dropout
-from thinc.api import (
-    SparseLinear,
-    Softmax,
-    softmax_activation,
-    Maxout,
-    reduce_sum,
-    Relu,
-    residual,
-    expand_window,
-)
-from thinc.api import (
-    HashEmbed,
-    with_ragged,
-    with_array,
-    with_cpu,
-    uniqued,
-    FeatureExtractor,
-)
+from typing import Optional
+from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
+from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
+from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
+from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued
+from thinc.api import Relu, residual, expand_window, FeatureExtractor

 from ..spacy_vectors import SpacyVectors
 from ... import util

@@ -34,7 +13,9 @@ from ..extract_ngrams import extract_ngrams


 @registry.architectures.register("spacy.TextCatCNN.v1")
-def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
+def build_simple_cnn_text_classifier(
+    tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
+) -> Model:
     """
     Build a simple CNN text classifier, given a token-to-vector model as inputs.
     If exclusive_classes=True, a softmax non-linearity is applied, so that the

@@ -90,13 +71,25 @@ def build_text_classifier(
             nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
         )
         prefix = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout, seed=11
+            nO=width // 2,
+            nV=embed_size,
+            column=cols.index(PREFIX),
+            dropout=dropout,
+            seed=11,
         )
         suffix = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout, seed=12
+            nO=width // 2,
+            nV=embed_size,
+            column=cols.index(SUFFIX),
+            dropout=dropout,
+            seed=12,
         )
         shape = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout, seed=13
+            nO=width // 2,
+            nV=embed_size,
+            column=cols.index(SHAPE),
+            dropout=dropout,
+            seed=13,
         )

         width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
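The reformatted HashEmbed calls are purely cosmetic; the keyword arguments are unchanged. For illustration, a standalone construction with placeholder dimensions (any small nO/nV/column/seed values would do):

    from thinc.api import HashEmbed

    # Same keyword arguments as above; the concrete values are placeholders.
    prefix = HashEmbed(nO=32, nV=2000, column=1, dropout=0.1, seed=11)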
@@ -11,8 +11,8 @@ from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE


 @registry.architectures.register("spacy.Tok2VecTensors.v1")
-def tok2vec_tensors_v1(width):
-    tok2vec = Tok2VecListener("tok2vec", width=width)
+def tok2vec_tensors_v1(width, upstream="*"):
+    tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
     return tok2vec
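The new upstream argument controls which Tok2Vec component the listener binds to; judging by the default value, "*" acts as a wildcard that matches any upstream component. A sketch of building the layer via the registry name shown above (the width is a placeholder):

    from spacy.util import registry

    make_listener = registry.architectures.get("spacy.Tok2VecTensors.v1")
    listener = make_listener(width=96)                      # default upstream="*"
    listener = make_listener(width=96, upstream="tok2vec")  # or bind to a named component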
@@ -1,30 +1,37 @@
+from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING
 from wasabi import Printer
 import warnings

 from .tokens import Doc, Token, Span
 from .errors import Errors, Warnings
+from .util import dot_to_dict
+
+if TYPE_CHECKING:
+    # This lets us add type hints for mypy etc. without causing circular imports
+    from .language import Language  # noqa: F401


-def analyze_pipes(pipeline, name, pipe, index, warn=True):
+def analyze_pipes(
+    nlp: "Language", name: str, index: int, warn: bool = True
+) -> List[str]:
     """Analyze a pipeline component with respect to its position in the current
     pipeline and the other components. Will check whether requirements are
     fulfilled (e.g. if previous components assign the attributes).

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    nlp (Language): The current nlp object.
     name (str): The name of the pipeline component to analyze.
-    pipe (callable): The pipeline component function to analyze.
     index (int): The index of the component in the pipeline.
     warn (bool): Show user warning if problem is found.
-    RETURNS (list): The problems found for the given pipeline component.
+    RETURNS (List[str]): The problems found for the given pipeline component.
     """
-    assert pipeline[index][0] == name
-    prev_pipes = pipeline[:index]
-    pipe_requires = getattr(pipe, "requires", [])
-    requires = {annot: False for annot in pipe_requires}
+    assert nlp.pipeline[index][0] == name
+    prev_pipes = nlp.pipeline[:index]
+    meta = nlp.get_pipe_meta(name)
+    requires = {annot: False for annot in meta.requires}
     if requires:
         for prev_name, prev_pipe in prev_pipes:
-            prev_assigns = getattr(prev_pipe, "assigns", [])
-            for annot in prev_assigns:
+            prev_meta = nlp.get_pipe_meta(prev_name)
+            for annot in prev_meta.assigns:
                 requires[annot] = True
     problems = []
     for annot, fulfilled in requires.items():

@@ -35,46 +42,29 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
     return problems


-def analyze_all_pipes(pipeline, warn=True):
+def analyze_all_pipes(nlp: "Language", warn: bool = True) -> Dict[str, List[str]]:
     """Analyze all pipes in the pipeline in order.

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    nlp (Language): The current nlp object.
     warn (bool): Show user warning if problem is found.
-    RETURNS (dict): The problems found, keyed by component name.
+    RETURNS (Dict[str, List[str]]): The problems found, keyed by component name.
     """
     problems = {}
-    for i, (name, pipe) in enumerate(pipeline):
-        problems[name] = analyze_pipes(pipeline, name, pipe, i, warn=warn)
+    for i, name in enumerate(nlp.pipe_names):
+        problems[name] = analyze_pipes(nlp, name, i, warn=warn)
     return problems


-def dot_to_dict(values):
-    """Convert dot notation to a dict. For example: ["token.pos", "token._.xyz"]
-    become {"token": {"pos": True, "_": {"xyz": True }}}.
-
-    values (iterable): The values to convert.
-    RETURNS (dict): The converted values.
-    """
-    result = {}
-    for value in values:
-        path = result
-        parts = value.lower().split(".")
-        for i, item in enumerate(parts):
-            is_last = i == len(parts) - 1
-            path = path.setdefault(item, True if is_last else {})
-    return result
-
-
-def validate_attrs(values):
+def validate_attrs(values: Iterable[str]) -> Iterable[str]:
     """Validate component attributes provided to "assigns", "requires" etc.
     Raises error for invalid attributes and formatting. Doesn't check if
     custom extension attributes are registered, since this is something the
     user might want to do themselves later in the component.

-    values (iterable): The string attributes to check, e.g. `["token.pos"]`.
-    RETURNS (iterable): The checked attributes.
+    values (Iterable[str]): The string attributes to check, e.g. `["token.pos"]`.
+    RETURNS (Iterable[str]): The checked attributes.
     """
-    data = dot_to_dict(values)
+    data = dot_to_dict({value: True for value in values})
     objs = {"doc": Doc, "token": Token, "span": Span}
     for obj_key, attrs in data.items():
         if obj_key == "span":
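The local dot_to_dict helper is gone; validate_attrs now relies on the shared spacy.util.dot_to_dict, which expects a mapping rather than a plain iterable, hence the {value: True for value in values} comprehension. A sketch of the equivalence, assuming the shared helper expands dotted keys the way the removed docstring describes:

    from spacy.util import dot_to_dict

    attrs = ["token.pos", "token._.xyz"]
    assert dot_to_dict({value: True for value in attrs}) == {
        "token": {"pos": True, "_": {"xyz": True}}
    }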
@@ -111,37 +101,40 @@ def validate_attrs(values):
     return values


-def _get_feature_for_attr(pipeline, attr, feature):
+def _get_feature_for_attr(nlp: "Language", attr: str, feature: str) -> List[str]:
     assert feature in ["assigns", "requires"]
     result = []
-    for pipe_name, pipe in pipeline:
-        pipe_assigns = getattr(pipe, feature, [])
+    for pipe_name in nlp.pipe_names:
+        meta = nlp.get_pipe_meta(pipe_name)
+        pipe_assigns = getattr(meta, feature, [])
         if attr in pipe_assigns:
-            result.append((pipe_name, pipe))
+            result.append(pipe_name)
     return result


-def get_assigns_for_attr(pipeline, attr):
+def get_assigns_for_attr(nlp: "Language", attr: str) -> List[str]:
     """Get all pipeline components that assign an attr, e.g. "doc.tensor".

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    pipeline (Language): The current nlp object.
     attr (str): The attribute to check.
-    RETURNS (list): (name, pipeline) tuples of components that assign the attr.
+    RETURNS (List[str]): Names of components that require the attr.
     """
-    return _get_feature_for_attr(pipeline, attr, "assigns")
+    return _get_feature_for_attr(nlp, attr, "assigns")


-def get_requires_for_attr(pipeline, attr):
+def get_requires_for_attr(nlp: "Language", attr: str) -> List[str]:
     """Get all pipeline components that require an attr, e.g. "doc.tensor".

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    pipeline (Language): The current nlp object.
     attr (str): The attribute to check.
-    RETURNS (list): (name, pipeline) tuples of components that require the attr.
+    RETURNS (List[str]): Names of components that require the attr.
     """
-    return _get_feature_for_attr(pipeline, attr, "requires")
+    return _get_feature_for_attr(nlp, attr, "requires")


-def print_summary(nlp, pretty=True, no_print=False):
+def print_summary(
+    nlp: "Language", pretty: bool = True, no_print: bool = False
+) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]:
     """Print a formatted summary for the current nlp object's pipeline. Shows
     a table with the pipeline components and why they assign and require, as
     well as any problems if available.

@@ -154,12 +147,10 @@ def print_summary(nlp, pretty=True, no_print=False):
     msg = Printer(pretty=pretty, no_print=no_print)
     overview = []
     problems = {}
-    for i, (name, pipe) in enumerate(nlp.pipeline):
-        requires = getattr(pipe, "requires", [])
-        assigns = getattr(pipe, "assigns", [])
-        retok = getattr(pipe, "retokenizes", False)
-        overview.append((i, name, requires, assigns, retok))
-        problems[name] = analyze_pipes(nlp.pipeline, name, pipe, i, warn=False)
+    for i, name in enumerate(nlp.pipe_names):
+        meta = nlp.get_pipe_meta(name)
+        overview.append((i, name, meta.requires, meta.assigns, meta.retokenizes))
+        problems[name] = analyze_pipes(nlp, name, i, warn=False)
     msg.divider("Pipeline Overview")
     header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
     msg.table(overview, header=header, divider=True, multiline=True)

@@ -175,15 +166,19 @@ def print_summary(nlp, pretty=True, no_print=False):
         return {"overview": overview, "problems": problems}


-def count_pipeline_interdependencies(pipeline):
+def count_pipeline_interdependencies(nlp: "Language") -> List[int]:
     """Count how many subsequent components require an annotation set by each
     component in the pipeline.
+
+    nlp (Language): The current nlp object.
+    RETURNS (List[int]): The interdependency counts.
     """
     pipe_assigns = []
     pipe_requires = []
-    for name, pipe in pipeline:
-        pipe_assigns.append(set(getattr(pipe, "assigns", [])))
-        pipe_requires.append(set(getattr(pipe, "requires", [])))
+    for name in nlp.pipe_names:
+        meta = nlp.get_pipe_meta(name)
+        pipe_assigns.append(set(meta.assigns))
+        pipe_requires.append(set(meta.requires))
     counts = []
     for i, assigns in enumerate(pipe_assigns):
         count = 0
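All of these helpers now take the nlp object and read component metadata through nlp.get_pipe_meta instead of poking attributes on the component functions. A hedged usage sketch; the blank English pipeline is purely a stand-in, and the module path is an assumption:

    import spacy
    from spacy.pipe_analysis import analyze_all_pipes, print_summary  # assumed module path

    nlp = spacy.blank("en")                        # empty pipeline, illustrative only
    problems = analyze_all_pipes(nlp, warn=False)  # {} here, since nothing requires anything yet
    print_summary(nlp)                             # overview table of requires/assigns/retokenizes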
@@ -1,28 +1,33 @@
-from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
-from .pipes import TextCategorizer, Pipe, Sentencizer
-from .pipes import SentenceRecognizer
-from .simple_ner import SimpleNER
-from .morphologizer import Morphologizer
+from .dep_parser import DependencyParser
+from .entity_linker import EntityLinker
+from .ner import EntityRecognizer
 from .entityruler import EntityRuler
+from .morphologizer import Morphologizer
+from .pipe import Pipe
+from spacy.pipeline.senter import SentenceRecognizer
+from .sentencizer import Sentencizer
+from .simple_ner import SimpleNER
+from .tagger import Tagger
+from .textcat import TextCategorizer
 from .tok2vec import Tok2Vec
 from .hooks import SentenceSegmenter, SimilarityHook
 from .functions import merge_entities, merge_noun_chunks, merge_subtokens

 __all__ = [
-    "Tagger",
     "DependencyParser",
-    "EntityRecognizer",
     "EntityLinker",
-    "TextCategorizer",
-    "Tok2Vec",
-    "Pipe",
-    "Morphologizer",
+    "EntityRecognizer",
     "EntityRuler",
-    "Sentencizer",
-    "SentenceSegmenter",
+    "Morphologizer",
+    "Pipe",
     "SentenceRecognizer",
+    "SentenceSegmenter",
+    "Sentencizer",
     "SimilarityHook",
     "SimpleNER",
+    "Tagger",
+    "TextCategorizer",
+    "Tok2Vec",
     "merge_entities",
     "merge_noun_chunks",
     "merge_subtokens",
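Aside (not part of the diff): splitting pipes.pyx into per-component modules leaves the public import surface unchanged, since the package still re-exports every class through __all__ above. For example:

    # These imports resolve to the new per-component modules via the package __init__.
    from spacy.pipeline import DependencyParser, EntityRecognizer, Tagger
    from spacy.pipeline import Morphologizer, Sentencizer, Tok2Vec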
@@ -1,93 +0,0 @@
-from pathlib import Path
-
-from ... import util
-
-
-def default_nel_config():
-    loc = Path(__file__).parent / "entity_linker_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_nel():
-    loc = Path(__file__).parent / "entity_linker_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_morphologizer_config():
-    loc = Path(__file__).parent / "morphologizer_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_morphologizer():
-    loc = Path(__file__).parent / "morphologizer_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_parser_config():
-    loc = Path(__file__).parent / "parser_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_parser():
-    loc = Path(__file__).parent / "parser_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_ner_config():
-    loc = Path(__file__).parent / "ner_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_ner():
-    loc = Path(__file__).parent / "ner_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_senter_config():
-    loc = Path(__file__).parent / "senter_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_senter():
-    loc = Path(__file__).parent / "senter_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_tagger_config():
-    loc = Path(__file__).parent / "tagger_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_tagger():
-    loc = Path(__file__).parent / "tagger_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_textcat_config():
-    loc = Path(__file__).parent / "textcat_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_textcat():
-    loc = Path(__file__).parent / "textcat_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_tok2vec_config():
-    loc = Path(__file__).parent / "tok2vec_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_tok2vec():
-    loc = Path(__file__).parent / "tok2vec_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_simple_ner_config():
-    loc = Path(__file__).parent / "simple_ner_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_simple_ner():
-    loc = Path(__file__).parent / "simple_ner_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
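Aside (not part of the diff): the default_*() helpers deleted above loaded standalone *_defaults.cfg files; component defaults are now declared where the factory is registered. A minimal sketch of that pattern, assuming the spaCy v3 @Language.factory API; the component name and config key are invented:

    from spacy.language import Language

    @Language.factory("my_marker", default_config={"label": "CUSTOM"})
    def create_my_marker(nlp: Language, name: str, label: str):
        # default_config above replaces a standalone *_defaults.cfg lookup.
        def my_marker(doc):
            return doc  # no-op component, for illustration only
        return my_marker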
@@ -1,13 +0,0 @@
-[model]
-@architectures = "spacy.EntityLinker.v1"
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 2
-embed_size = 300
-window_size = 1
-maxout_pieces = 3
-subword_features = true
-dropout = null
@@ -1,14 +0,0 @@
-[model]
-@architectures = "spacy.Tagger.v1"
-
-[model.tok2vec]
-@architectures = "spacy.HashCharEmbedCNN.v1"
-pretrained_vectors = null
-width = 128
-depth = 4
-embed_size = 7000
-window_size = 1
-maxout_pieces = 3
-nM = 64
-nC = 8
-dropout = null
@@ -1,15 +0,0 @@
-[model]
-@architectures = "spacy.MultiTask.v1"
-maxout_pieces = 3
-token_vector_width = 96
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 4
-embed_size = 2000
-window_size = 1
-maxout_pieces = 2
-subword_features = true
-dropout = null
@@ -1,16 +0,0 @@
-[model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
-hidden_width = 64
-maxout_pieces = 2
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 4
-embed_size = 2000
-window_size = 1
-maxout_pieces = 3
-subword_features = true
-dropout = null
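Aside (not part of the diff): the removed *_defaults.cfg files, like the one above, hold model settings that now ship with the components themselves and with the unified training config. A rough Python rendering of the settings above, for illustration only; the variable name and dict shape are invented:

    # Values copied from the removed config above; null/true map to None/True.
    TRANSITION_BASED_MODEL_DEFAULTS = {
        "@architectures": "spacy.TransitionBasedParser.v1",
        "nr_feature_tokens": 6,
        "hidden_width": 64,
        "maxout_pieces": 2,
        "tok2vec": {
            "@architectures": "spacy.HashEmbedCNN.v1",
            "pretrained_vectors": None,
            "width": 96,
            "depth": 4,
            "embed_size": 2000,
            "window_size": 1,
            "maxout_pieces": 3,
            "subword_features": True,
            "dropout": None,
        },
    }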
Some files were not shown because too many files have changed in this diff.