From 945f795a3e4ebb0bab6e4c0420ec1dc590437422 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 22 Jul 2020 15:59:37 +0200
Subject: [PATCH] WIP: move more language data to config

---
 spacy/cli/init_model.py        |  7 ++---
 spacy/cli/train.py             |  7 ++---
 spacy/default_config.cfg       |  2 ++
 spacy/errors.py                |  5 +---
 spacy/gold/augment.py          |  5 ++--
 spacy/lang/bn/__init__.py      |  3 +-
 spacy/lang/ca/__init__.py      |  3 +-
 spacy/lang/da/__init__.py      |  8 +++++-
 spacy/lang/de/__init__.py      | 22 +++++----------
 spacy/lang/el/__init__.py      | 12 ++++++--
 spacy/lang/en/__init__.py      | 20 ++++++--------
 spacy/lang/es/__init__.py      |  8 +++++-
 spacy/lang/fa/__init__.py      |  3 +-
 spacy/lang/fr/__init__.py      |  7 +++--
 spacy/lang/hr/__init__.py      |  3 +-
 spacy/lang/hu/__init__.py      |  3 +-
 spacy/lang/id/__init__.py      |  8 +++++-
 spacy/lang/it/__init__.py      |  3 +-
 spacy/lang/lb/__init__.py      |  8 +++++-
 spacy/lang/lt/__init__.py      |  3 +-
 spacy/lang/nb/__init__.py      |  3 +-
 spacy/lang/nl/__init__.py      |  7 +++--
 spacy/lang/pl/__init__.py      |  7 +++--
 spacy/lang/pt/__init__.py      |  3 +-
 spacy/lang/ro/__init__.py      |  3 +-
 spacy/lang/ru/__init__.py      |  5 ++++
 spacy/lang/sr/__init__.py      |  8 +++++-
 spacy/lang/sv/__init__.py      |  3 +-
 spacy/lang/ta/__init__.py      |  5 ++++
 spacy/lang/th/__init__.py      |  5 ++++
 spacy/lang/tl/__init__.py      |  3 +-
 spacy/lang/tr/__init__.py      |  3 +-
 spacy/lang/ur/__init__.py      |  3 +-
 spacy/language.py              |  4 +--
 spacy/lemmatizer.py            | 19 +++++--------
 spacy/lexeme.pyx               | 10 +++----
 spacy/lookups.py               | 18 ++++++++----
 spacy/schemas.py               |  1 +
 spacy/tests/test_lemmatizer.py |  7 ++++-
 spacy/vocab.pxd                |  1 -
 spacy/vocab.pyx                | 50 +++++++++-------------------
 41 files changed, 174 insertions(+), 134 deletions(-)

diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 9fb346006..f0c80bb8c 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -112,10 +112,9 @@ def init_model(
-    # Create empty extra lexeme tables so the data from spacy-lookups-data
-    # isn't loaded if these features are accessed
+    # Remove the extra lexeme tables loaded from spacy-lookups-data so
+    # their data isn't saved with the model
     if omit_extra_lookups:
-        nlp.vocab.lookups_extra = Lookups()
-        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
-        nlp.vocab.lookups_extra.add_table("lexeme_prob")
-        nlp.vocab.lookups_extra.add_table("lexeme_settings")
+        nlp.vocab.lookups.remove_table("lexeme_cluster")
+        nlp.vocab.lookups.remove_table("lexeme_prob")
+        nlp.vocab.lookups.remove_table("lexeme_settings")

     msg.good("Successfully created model")
     if vectors_loc is not None:
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 6ff665368..310580dbb 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -123,10 +123,9 @@ def train(
-    # Create empty extra lexeme tables so the data from spacy-lookups-data
-    # isn't loaded if these features are accessed
+    # Remove the extra lexeme tables loaded from spacy-lookups-data so
+    # their data isn't saved with the model
     if config["training"]["omit_extra_lookups"]:
-        nlp.vocab.lookups_extra = Lookups()
-        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
-        nlp.vocab.lookups_extra.add_table("lexeme_prob")
-        nlp.vocab.lookups_extra.add_table("lexeme_settings")
+        nlp.vocab.lookups.remove_table("lexeme_cluster")
+        nlp.vocab.lookups.remove_table("lexeme_prob")
+        nlp.vocab.lookups.remove_table("lexeme_settings")

     # Load a pretrained tok2vec model - cf. CLI command 'pretrain'
     if weights_data is not None:
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 7e6c7a6ec..747194cb4 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -2,6 +2,7 @@
 lang = null
 stop_words = []
 lex_attr_getters = {}
+vocab_data = {}
 pipeline = []

 [nlp.tokenizer]
@@ -9,6 +10,7 @@ pipeline = []

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"
+data = {}

 [nlp.writing_system]
 direction = "ltr"
diff --git a/spacy/errors.py b/spacy/errors.py
index f6c7a569f..719e0204b 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -434,9 +434,6 @@ class Errors:
    E170 = ("Cannot apply transition {name}: invalid for the current state.")
    E171 = ("Matcher.add received invalid on_match callback argument: expected "
            "callable or None, but got: {arg_type}")
-   E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
-           "Lemmatizer, initialize the class directly. See the docs for "
-           "details: https://spacy.io/api/lemmatizer")
    E175 = ("Can't remove rule for unknown match pattern ID: {key}")
    E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
    E177 = ("Ill-formed IOB input detected: {tag}")
@@ -601,7 +598,7 @@ class Errors:
             "the same `Vocab`.")
    E1000 = ("No pkuseg model available. Provide a pkuseg model when "
             "initializing the pipeline:\n"
-            'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\m'
+            'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n'
             'nlp = Chinese(config=cfg)')
diff --git a/spacy/gold/augment.py b/spacy/gold/augment.py
index 45cfc0abe..790762617 100644
--- a/spacy/gold/augment.py
+++ b/spacy/gold/augment.py
@@ -25,8 +25,9 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
             lower = True
             if raw is not None:
                 raw = raw.lower()
-    ndsv = nlp.Defaults.single_orth_variants
-    ndpv = nlp.Defaults.paired_orth_variants
+    orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
+    ndsv = orth_variants.get("single", [])
+    ndpv = orth_variants.get("paired", [])
     words = token_dict.get("words", [])
     tags = token_dict.get("tags", [])
     # keep unmodified if words or tags are not defined
diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py
index 2ac771537..4b80e0c41 100644
--- a/spacy/lang/bn/__init__.py
+++ b/spacy/lang/bn/__init__.py
@@ -17,9 +17,10 @@ stop_words = {"@language_data": "spacy.bn.stop_words"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_rules"]
 """
diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py
index d2924e902..cab47555d 100644
--- a/spacy/lang/ca/__init__.py
+++ b/spacy/lang/ca/__init__.py
@@ -19,9 +19,10 @@ lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py
index 82ed5ed34..4f3802b21 100644
--- a/spacy/lang/da/__init__.py
+++ b/spacy/lang/da/__init__.py
@@ -19,9 +19,15 @@ lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """
diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index a5c38bd39..d620ded58 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -19,9 +19,15 @@ stop_words = {"@language_data": "spacy.de.stop_words"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"]
 """
@@ -36,20 +42,6 @@ class GermanDefaults(Language.Defaults):
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
     syntax_iterators = SYNTAX_ITERATORS
-    single_orth_variants = [
-        {"tags": ["$("], "variants": ["…", "..."]},
-        {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]},
-    ]
-    paired_orth_variants = [
-        {
-            "tags": ["$("],
-            "variants": [("'", "'"), (",", "'"), ("‚", "‘"), ("›", "‹"), ("‹", "›")],
-        },
-        {
-            "tags": ["$("],
-            "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")],
-        },
-    ]


 class German(Language):
diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py
index 2fd8647fb..65c634340 100644
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -21,15 +21,21 @@ lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.GreekLemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_index", "lemma_exc", "lemma_rules"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm", "lexeme_prob", "lexeme_settings"]
 """


 @registry.lemmatizers("spacy.GreekLemmatizer.v1")
-def create_greek_lemmatizer(data_paths: dict = {}) -> GreekLemmatizer:
-    return GreekLemmatizer(data_paths=data_paths)
+def create_greek_lemmatizer(data: Dict[str, dict] = {}) -> GreekLemmatizer:
+    return GreekLemmatizer(data=data)


 @registry.language_data("spacy.el.stop_words")
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index 4a69b2a41..3e21cf21b 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -22,9 +22,15 @@ lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.EnglishLemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm", "lexeme_cluster", "lexeme_prob", "lexeme_settings", "orth_variants"]
 """
@@ -39,22 +45,14 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:

 @registry.lemmatizers("spacy.EnglishLemmatizer.v1")
-def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
-    return Lemmatizer(data_paths=data_paths, is_base_form=is_base_form)
+def create_lemmatizer(data: Dict[str, dict] = {}) -> "Lemmatizer":
+    return Lemmatizer(data=data, is_base_form=is_base_form)


 class EnglishDefaults(Language.Defaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     syntax_iterators = SYNTAX_ITERATORS
     infixes = TOKENIZER_INFIXES
-    single_orth_variants = [
-        {"tags": ["NFP"], "variants": ["…", "..."]},
-        {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
-    ]
-    paired_orth_variants = [
-        {"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
-        {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
-    ]


 class English(Language):
diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py
index 4425bfc01..52aef4521 100644
--- a/spacy/lang/es/__init__.py
+++ b/spacy/lang/es/__init__.py
@@ -20,9 +20,15 @@ lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_cluster", "lexeme_prob", "lexeme_settings"]
 """
diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py
index 085f400a4..41e40ca30 100644
--- a/spacy/lang/fa/__init__.py
+++ b/spacy/lang/fa/__init__.py
@@ -24,9 +24,10 @@ has_letters = true

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_rules", "lemma_index", "lemma_exc"]
 """
diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index 8140a21b6..4ec30cbd9 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -22,15 +22,16 @@ lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.FrenchLemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
 """


 @registry.lemmatizers("spacy.FrenchLemmatizer.v1")
-def create_french_lemmatizer(data_paths: dict = {}) -> FrenchLemmatizer:
-    return FrenchLemmatizer(data_paths=data_paths, is_base_form=is_base_form)
+def create_french_lemmatizer(data: Dict[str, dict] = {}) -> FrenchLemmatizer:
+    return FrenchLemmatizer(data=data, is_base_form=is_base_form)


 @registry.language_data("spacy.fr.stop_words")
diff --git a/spacy/lang/hr/__init__.py b/spacy/lang/hr/__init__.py
index 648186093..e841ee24d 100644
--- a/spacy/lang/hr/__init__.py
+++ b/spacy/lang/hr/__init__.py
@@ -15,9 +15,10 @@ stop_words = {"@language_data": "spacy.hr.stop_words"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py
index 3e83e971a..2cfd61dfa 100644
--- a/spacy/lang/hu/__init__.py
+++ b/spacy/lang/hu/__init__.py
@@ -17,9 +17,10 @@ stop_words = {"@language_data": "spacy.hu.stop_words"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index b8b34aa26..8998addb4 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -20,9 +20,15 @@ lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """
diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py
index 1b0a15348..f6b6afa59 100644
--- a/spacy/lang/it/__init__.py
+++ b/spacy/lang/it/__init__.py
@@ -17,9 +17,10 @@ stop_words = {"@language_data": "spacy.it.stop_words"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py
index 54e4e82c0..d381bb2e7 100644
--- a/spacy/lang/lb/__init__.py
+++ b/spacy/lang/lb/__init__.py
@@ -19,9 +19,15 @@ lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """
diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py
index 656df79c9..23c11f3a1 100644
--- a/spacy/lang/lt/__init__.py
+++ b/spacy/lang/lt/__init__.py
@@ -19,9 +19,10 @@ lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py
index e472b0c60..3b386344b 100644
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -19,9 +19,10 @@ stop_words = {"@language_data": "spacy.nb.stop_words"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup", "lemma_rules", "lemma_exc"]
 """
diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py
index 7e9806bc3..ab2cf3a94 100644
--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@@ -21,9 +21,10 @@ lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.DutchLemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
 """
@@ -38,8 +39,8 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:

 @registry.lemmatizers("spacy.DutchLemmatizer.v1")
-def create_dutch_lemmatizer(data_paths: dict = {}) -> DutchLemmatizer:
-    return DutchLemmatizer(data_paths=data_paths)
+def create_dutch_lemmatizer(data: Dict[str, dict] = {}) -> DutchLemmatizer:
+    return DutchLemmatizer(data=data)


 class DutchDefaults(Language.Defaults):
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index 87a174ec8..82957dc7a 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -20,9 +20,10 @@ lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.PolishLemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv", "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num", "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"]
 """
@@ -37,8 +38,8 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:

 @registry.lemmatizers("spacy.PolishLemmatizer.v1")
-def create_polish_lemmatizer(data_paths: dict = {}) -> PolishLemmatizer:
-    return PolishLemmatizer(data_paths=data_paths)
+def create_polish_lemmatizer(data: Dict[str, dict] = {}) -> PolishLemmatizer:
+    return PolishLemmatizer(data=data)


 class PolishDefaults(Language.Defaults):
diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py
index 6dc22ed61..045bd3bc1 100644
--- a/spacy/lang/pt/__init__.py
+++ b/spacy/lang/pt/__init__.py
@@ -19,9 +19,10 @@ lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py
index b66b7767c..740bd7911 100644
--- a/spacy/lang/ro/__init__.py
+++ b/spacy/lang/ro/__init__.py
@@ -22,9 +22,10 @@ stop_words = {"@language_data": "spacy.ro.stop_words"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index 004a8d83a..e9e28dfb5 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -18,6 +18,11 @@ lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.RussianLemmatizer.v1"
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """
diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py
index fd53d3826..f69ad3a89 100644
--- a/spacy/lang/sr/__init__.py
+++ b/spacy/lang/sr/__init__.py
@@ -18,9 +18,15 @@ lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index 5c376fd51..c18ad775d 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -22,9 +22,10 @@ lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup", "lemma_rules"]
 """
diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py
index 983bd5de4..c429127c9 100644
--- a/spacy/lang/ta/__init__.py
+++ b/spacy/lang/ta/__init__.py
@@ -12,6 +12,11 @@ DEFAULT_CONFIG = """
 lang = "ta"
 stop_words = {"@language_data": "spacy.ta.stop_words"}
 lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """
diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index 116355342..1fdf4311e 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -16,6 +16,11 @@ lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}

 [nlp.tokenizer]
 @tokenizers = "spacy.ThaiTokenizer.v1"
+
+[nlp.vocab_data]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+tables = ["lexeme_norm"]
 """
diff --git a/spacy/lang/tl/__init__.py b/spacy/lang/tl/__init__.py
index c52adb046..a7158e6f6 100644
--- a/spacy/lang/tl/__init__.py
+++ b/spacy/lang/tl/__init__.py
@@ -18,9 +18,10 @@ lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py
index f6782b419..dff56e945 100644
--- a/spacy/lang/tr/__init__.py
+++ b/spacy/lang/tr/__init__.py
@@ -16,9 +16,10 @@ stop_words = {"@language_data": "spacy.tr.stop_words"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/lang/ur/__init__.py b/spacy/lang/ur/__init__.py
index c7977d6b8..db714c296 100644
--- a/spacy/lang/ur/__init__.py
+++ b/spacy/lang/ur/__init__.py
@@ -23,9 +23,10 @@ has_letters = true

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.lemmatizer.data_paths]
+[nlp.lemmatizer.data]
 @language_data = "spacy-lookups-data"
 lang = ${nlp:lang}
+tables = ["lemma_lookup"]
 """
diff --git a/spacy/language.py b/spacy/language.py
index 97c8f31b7..77d0b4b0e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -55,8 +55,6 @@ class BaseDefaults:
     tokenizer_exceptions: Dict[str, List[dict]] = {}
     morph_rules: Dict[str, Dict[str, dict]] = {}
     syntax_iterators: Dict[str, Callable[[Union[Doc, Span]], Iterator]] = {}
-    single_orth_variants: List[Dict[str, List[str]]] = []
-    paired_orth_variants: List[Dict[str, Union[List[str], List[Tuple[str, str]]]]] = []


 class Language:
@@ -1268,11 +1266,13 @@ class Language:
         lemmatizer = resolved["nlp"]["lemmatizer"]
         lex_attr_getters = resolved["nlp"]["lex_attr_getters"]
         stop_words = resolved["nlp"]["stop_words"]
+        vocab_data = resolved["nlp"]["vocab_data"]
         vocab = Vocab.from_config(
             filled,
             lemmatizer=lemmatizer,
             lex_attr_getters=lex_attr_getters,
             stop_words=stop_words,
+            vocab_data=vocab_data,
             # TODO: what should we do with these?
             tag_map=cls.Defaults.tag_map,
             morph_rules=cls.Defaults.morph_rules,
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 81dbf4ea3..8255b4b36 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -1,14 +1,13 @@
 from typing import Optional, Callable, List, Dict

 from .lookups import Lookups
-from .errors import Errors
 from .parts_of_speech import NAMES as UPOS_NAMES
-from .util import registry, load_language_data, SimpleFrozenDict
+from .util import registry


 @registry.lemmatizers("spacy.Lemmatizer.v1")
-def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
-    return Lemmatizer(data_paths=data_paths)
+def create_lemmatizer(data: Dict[str, dict] = {}) -> "Lemmatizer":
+    return Lemmatizer(data=data)


 class Lemmatizer:
@@ -19,14 +18,10 @@ class Lemmatizer:

     DOCS: https://spacy.io/api/lemmatizer
     """

-    @classmethod
-    def load(cls, *args, **kwargs):
-        raise NotImplementedError(Errors.E172)
-
     def __init__(
         self,
         lookups: Optional[Lookups] = None,
-        data_paths: dict = SimpleFrozenDict(),
+        data: Dict[str, dict] = {},
         is_base_form: Optional[Callable] = None,
     ) -> None:
         """Initialize a Lemmatizer.
@@ -36,9 +31,9 @@ def __init__(
         RETURNS (Lemmatizer): The newly constructed object.
""" self.lookups = lookups if lookups is not None else Lookups() - for name, filename in data_paths.items(): - data = load_language_data(filename) - self.lookups.add_table(name, data) + for name, table in data.items(): + if table is not None: + self.lookups.add_table(name, table) self.is_base_form = is_base_form def __call__( diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index edaf874a3..25461b4b7 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -251,11 +251,11 @@ cdef class Lexeme: property cluster: """RETURNS (int): Brown cluster ID.""" def __get__(self): - cluster_table = self.vocab.load_extra_lookups("lexeme_cluster") + cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) return cluster_table.get(self.c.orth, 0) def __set__(self, int x): - cluster_table = self.vocab.load_extra_lookups("lexeme_cluster") + cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) cluster_table[self.c.orth] = x property lang: @@ -270,13 +270,13 @@ cdef class Lexeme: """RETURNS (float): Smoothed log probability estimate of the lexeme's type.""" def __get__(self): - prob_table = self.vocab.load_extra_lookups("lexeme_prob") - settings_table = self.vocab.load_extra_lookups("lexeme_settings") + prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) + settings_table = self.vocab.lookups.get_table("lexeme_settings", {}) default_oov_prob = settings_table.get("oov_prob", -20.0) return prob_table.get(self.c.orth, default_oov_prob) def __set__(self, float x): - prob_table = self.vocab.load_extra_lookups("lexeme_prob") + prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) prob_table[self.c.orth] = x property lower_: diff --git a/spacy/lookups.py b/spacy/lookups.py index b03a326b6..d5def882e 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -5,7 +5,7 @@ from preshed.bloom import BloomFilter from collections import OrderedDict from .errors import Errors -from .util import SimpleFrozenDict, ensure_path, registry +from .util import SimpleFrozenDict, ensure_path, registry, load_language_data from .strings import get_string_id @@ -13,18 +13,26 @@ UNSET = object() @registry.language_data("spacy-lookups-data") -def get_lookups(lang: str) -> Dict[str, Any]: +def get_lookups(lang: str, tables: List[str]) -> Optional[Dict[str, Any]]: """Load the data from the spacy-lookups-data package for a given language, if available. Returns an empty dict if there's no data or if the package is not installed. lang (str): The language code (corresponds to entry point exposed by the spacy-lookups-data package). + tables (List[str]): Name of tables to load, e.g. ["lemma_lookup", "lemma_exc"] RETURNS (Dict[str, Any]): The lookups, keyed by table name. """ - if lang in registry.lookups: - return registry.lookups.get(lang) - return {} + # TODO: import spacy_lookups_data instead of going via entry points here? 
+    if lang not in registry.lookups:
+        return {}
+    data = registry.lookups.get(lang)
+    result = {}
+    for table in tables:
+        if table not in data:
+            raise ValueError("TODO: unknown table")
+        result[table] = load_language_data(data[table])
+    return result


 class Lookups:
diff --git a/spacy/schemas.py b/spacy/schemas.py
index bd4939392..ba5e812ee 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -243,6 +243,7 @@ class ConfigSchemaNlp(BaseModel):
     writing_system: ConfigSchemaNlpWritingSystem = Field(..., title="The language's writing system")
     stop_words: Sequence[StrictStr] = Field(..., title="Stop words to mark via Token/Lexeme.is_stop")
     lex_attr_getters: Dict[StrictStr, Callable] = Field(..., title="Custom getter functions for lexical attributes (e.g. like_num)")
+    vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables")
     # fmt: on

     class Config:
diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py
index 44f540132..3c904cb01 100644
--- a/spacy/tests/test_lemmatizer.py
+++ b/spacy/tests/test_lemmatizer.py
@@ -5,6 +5,7 @@ from spacy.lookups import Lookups
 from spacy.lemmatizer import Lemmatizer


+@pytest.mark.skip(reason="We probably don't want to support this anymore in v3?")
 def test_lemmatizer_reflects_lookups_changes():
     """Test for an issue that'd cause lookups available in a model loaded
     from disk to not be reflected in the lemmatizer."""
@@ -56,4 +57,8 @@ def test_lemmatizer_without_is_base_form_implementation():
     lookups.add_table("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}})
     lemmatizer = Lemmatizer(lookups, is_base_form=None)
-    assert lemmatizer("Formuesskatten", "noun", {'Definite': 'def', 'Gender': 'masc', 'Number': 'sing'}) == ["formuesskatt"]
+    assert lemmatizer(
+        "Formuesskatten",
+        "noun",
+        {"Definite": "def", "Gender": "masc", "Number": "sing"},
+    ) == ["formuesskatt"]
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index f93b6cffe..a31c984ad 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -29,7 +29,6 @@ cdef class Vocab:
     cpdef public Morphology morphology
     cpdef public object vectors
     cpdef public object lookups
-    cpdef public object lookups_extra
     cpdef public object writing_system
     cdef readonly int length
     cdef public object data_dir
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 3ab90dd2f..1afee4f69 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -31,7 +31,7 @@ cdef class Vocab:
     DOCS: https://spacy.io/api/vocab
     """
     def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
-                 strings=tuple(), lookups=None, lookups_extra=None,
+                 strings=tuple(), lookups=None, vocab_data={},
                  oov_prob=-20., vectors_name=None, writing_system={},
                  **deprecated_kwargs):
         """Create the vocabulary.
@@ -44,7 +44,6 @@ cdef class Vocab:
         strings (StringStore): StringStore that maps strings to integers, and
             vice versa.
         lookups (Lookups): Container for large lookup tables and dictionaries.
-        lookups_extra (Lookups): Container for optional lookup tables and dictionaries.
         oov_prob (float): Default OOV probability.
         vectors_name (unicode): Optional name to identify the vectors table.
         RETURNS (Vocab): The newly constructed object.
@@ -53,12 +52,12 @@ cdef class Vocab:
         tag_map = tag_map if tag_map is not None else {}
         if lookups in (None, True, False):
             lookups = Lookups()
-        if "lexeme_norm" not in lookups:
-            lookups.add_table("lexeme_norm")
+        for name, data in vocab_data.items():
+            if name not in lookups:
+                data = data if data is not None else {}
+                lookups.add_table(name, data)
         if lemmatizer in (None, True, False):
             lemmatizer = Lemmatizer(lookups)
-        if lookups_extra in (None, True, False):
-            lookups_extra = Lookups()
         self.cfg = {'oov_prob': oov_prob}
         self.mem = Pool()
         self._by_orth = PreshMap()
@@ -71,7 +70,6 @@ cdef class Vocab:
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
         self.vectors = Vectors(name=vectors_name)
         self.lookups = lookups
-        self.lookups_extra = lookups_extra
         self.writing_system = writing_system

     @property
@@ -425,6 +423,7 @@ cdef class Vocab:
         lemmatizer=None,
         lex_attr_getters=None,
         stop_words=None,
+        vocab_data=None,
         vectors_name=None,
         tag_map=None,
         morph_rules=None
@@ -444,12 +443,12 @@ cdef class Vocab:
         if not lemmatizer:
             lemma_cfg = {"lemmatizer": config["nlp"]["lemmatizer"]}
             lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
-        lookups = lemmatizer.lookups
-        if "lexeme_norm" not in lookups:
-            lookups.add_table("lexeme_norm")
         if stop_words is None:
             stop_words_cfg = {"stop_words": config["nlp"]["stop_words"]}
             stop_words = registry.make_from_config(stop_words_cfg)["stop_words"]
+        if vocab_data is None:
+            vocab_data_cfg = {"vocab_data": config["nlp"]["vocab_data"]}
+            vocab_data = registry.make_from_config(vocab_data_cfg)["vocab_data"]
         if lex_attr_getters is None:
             lex_attrs_cfg = {"lex_attr_getters": config["nlp"]["lex_attr_getters"]}
             lex_attr_getters = registry.make_from_config(lex_attrs_cfg)["lex_attr_getters"]
@@ -462,14 +461,12 @@ cdef class Vocab:
         lex_attrs[NORM] = util.add_lookups(
             lex_attrs.get(NORM, LEX_ATTRS[NORM]),
             BASE_NORMS,
-            # TODO: we need to move the lexeme norms to their own entry
-            # points so we can specify them separately from the lemma lookups
-            lookups.get_table("lexeme_norm"),
+            vocab_data.get("lexeme_norm", {}),
         )
         vocab = cls(
             lex_attr_getters=lex_attrs,
+            vocab_data=vocab_data,
             lemmatizer=lemmatizer,
-            lookups=lookups,
             writing_system=writing_system,
             tag_map=tag_map,
         )
@@ -498,8 +495,6 @@ cdef class Vocab:
             self.vectors.to_disk(path)
-        if "lookups" not in "exclude" and self.lookups is not None:
+        if "lookups" not in exclude and self.lookups is not None:
             self.lookups.to_disk(path)
-        if "lookups_extra" not in "exclude" and self.lookups_extra is not None:
-            self.lookups_extra.to_disk(path, filename="lookups_extra.bin")

     def from_disk(self, path, exclude=tuple()):
         """Loads state from a directory. Modifies the object in place and
@@ -522,8 +517,6 @@ cdef class Vocab:
                 link_vectors_to_models(self)
         if "lookups" not in exclude:
             self.lookups.from_disk(path)
-        if "lookups_extra" not in exclude:
-            self.lookups_extra.from_disk(path, filename="lookups_extra.bin")
         if "lexeme_norm" in self.lookups:
             self.lex_attr_getters[NORM] = util.add_lookups(
                 self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm")
@@ -550,7 +543,6 @@ cdef class Vocab:
             "strings": lambda: self.strings.to_bytes(),
             "vectors": deserialize_vectors,
             "lookups": lambda: self.lookups.to_bytes(),
-            "lookups_extra": lambda: self.lookups_extra.to_bytes()
         }
         return util.to_bytes(getters, exclude)
@@ -574,7 +566,6 @@ cdef class Vocab:
             "lexemes": lambda b: self.lexemes_from_bytes(b),
             "vectors": lambda b: serialize_vectors(b),
             "lookups": lambda b: self.lookups.from_bytes(b),
-            "lookups_extra": lambda b: self.lookups_extra.from_bytes(b)
         }
         util.from_bytes(bytes_data, setters, exclude)
         if "lexeme_norm" in self.lookups:
@@ -592,19 +583,6 @@ cdef class Vocab:
         raise NotImplementedError


-    def load_extra_lookups(self, table_name):
-        if table_name not in self.lookups_extra:
-            if self.lang + "_extra" in util.registry.lookups:
-                tables = util.registry.lookups.get(self.lang + "_extra")
-                for name, filename in tables.items():
-                    if table_name == name:
-                        data = util.load_language_data(filename)
-                        self.lookups_extra.add_table(name, data)
-            if table_name not in self.lookups_extra:
-                self.lookups_extra.add_table(table_name)
-        return self.lookups_extra.get_table(table_name)
-
-
 def pickle_vocab(vocab):
     sstore = vocab.strings
     vectors = vocab.vectors
@@ -612,13 +590,12 @@ def pickle_vocab(vocab):
     morph = vocab.morphology
     data_dir = vocab.data_dir
     lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
     lookups = vocab.lookups
-    lookups_extra = vocab.lookups_extra
     return (unpickle_vocab,
-            (sstore, vectors, morph, data_dir, lex_attr_getters, lookups, lookups_extra))
+            (sstore, vectors, morph, data_dir, lex_attr_getters, lookups))


 def unpickle_vocab(sstore, vectors, morphology, data_dir,
-                   lex_attr_getters, lookups, lookups_extra):
+                   lex_attr_getters, lookups):
     cdef Vocab vocab = Vocab()
     vocab.vectors = vectors
     vocab.strings = sstore
@@ -626,7 +603,6 @@ def unpickle_vocab(sstore, vectors, morphology, data_dir,
     vocab.data_dir = data_dir
    vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
     vocab.lookups = lookups
-    vocab.lookups_extra = lookups_extra
     return vocab
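
---
Note for reviewers: a minimal sketch of the data flow this WIP introduces. The
table contents and words below are invented for illustration only; in a real
pipeline the [nlp.lemmatizer.data] and [nlp.vocab_data] config blocks resolve
"spacy-lookups-data" tables into dicts of exactly this shape.

    from spacy.lemmatizer import Lemmatizer
    from spacy.vocab import Vocab

    # Tables are now passed in directly as dicts instead of data_paths;
    # the new Lemmatizer.__init__ adds each one to its Lookups container.
    data = {"lemma_lookup": {"dogs": "dog", "ran": "run"}}
    lemmatizer = Lemmatizer(data=data)
    assert lemmatizer.lookups.get_table("lemma_lookup")["dogs"] == "dog"

    # Vocab receives its tables the same way via the new vocab_data argument,
    # replacing the previous lookups_extra container.
    vocab = Vocab(vocab_data={"lexeme_norm": {"Dogs": "dogs"}})
    assert "lexeme_norm" in vocab.lookups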