Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 00:46:28 +03:00)
Reduce stored lexemes data, move feats to lookups (#5238)
* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexemes features (`norm / cluster / prob`) to `spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemeC`, get/set/serialize in lookups only
  * Remove serialization of lexemes data as `vocab/lexemes.bin`
    * Remove `SerializedLexemeC`
    * Remove `Lexeme.to_bytes/from_bytes`
  * Modify normalization exception loading:
    * Always create `Vocab.lookups` table `lexeme_norm` for normalization exceptions
    * Load base exceptions from `lang.norm_exceptions`, but load language-specific exceptions from lookups
    * Set `lex_attr_getter[NORM]` including new lookups table in `BaseDefaults.create_vocab()` and when deserializing `Vocab`
  * Remove all cached lexemes when deserializing vocab to override existing normalizations with the new normalizations (as a replacement for the previous step that replaced all lexemes data with the deserialized data)

* Skip English normalization test

  Skip English normalization test because the data is now in `spacy-lookups-data`.

* Remove norm exceptions

  Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

  Load extra lookups (currently for cluster and prob) lazily from the entry point `lg_extra` as `Vocab.lookups_extra`.

* Skip creating lexeme cache on load

  To improve model loading times, do not create the full lexeme cache when loading. The lexemes will be created on demand when processing.

* Identify numeric values in Lexeme.set_attrs()

  With the removal of a special case for `PROB`, also identify `float` to avoid trying to convert it with the `StringStore`.

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

  Check strings rather than lexemes since lexemes aren't initialized automatically, account for addition of "_SP".

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

  Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether the lexeme has a vector.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
parent a41e28ceba
commit a5cd203284
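For orientation before the diff hunks below: a minimal sketch, assuming a spaCy v2.3-era installation (optionally with the `spacy-lookups-data` package), of where the relocated attributes are looked up after this change. Table names and defaults follow the commit message, not code taken verbatim from this diff.

import spacy

nlp = spacy.blank("en")
vocab = nlp.vocab

# Language-specific norm exceptions now live in a "lexeme_norm" lookups table
# (provided by spacy-lookups-data) instead of lang/<lang>/norm_exceptions.py.
if vocab.lookups.has_table("lexeme_norm"):
    norm_table = vocab.lookups.get_table("lexeme_norm")
    print(len(norm_table), "norm exceptions loaded")

# cluster and prob are no longer stored in LexemeC or serialized with the
# vocab; they are read lazily from Vocab.lookups_extra (entry point
# "lg_extra") when available, otherwise defaults are returned.
lex = vocab["apple"]
print(lex.prob)     # default value unless a lexeme_prob table is present
print(lex.cluster)  # default 0 unless a lexeme_cluster table is present

# is_oov is no longer a stored lexeme flag: it reports whether the lexeme
# has a vector, so it is True here because a blank pipeline has no vectors.
print(lex.is_oov)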
@@ -15,7 +15,7 @@ cdef enum attr_id_t:
     LIKE_NUM
     LIKE_EMAIL
     IS_STOP
-    IS_OOV
+    IS_OOV_DEPRECATED
     IS_BRACKET
     IS_QUOTE
     IS_LEFT_PUNCT
@@ -16,7 +16,7 @@ IDS = {
     "LIKE_NUM": LIKE_NUM,
     "LIKE_EMAIL": LIKE_EMAIL,
     "IS_STOP": IS_STOP,
-    "IS_OOV": IS_OOV,
+    "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
     "IS_BRACKET": IS_BRACKET,
     "IS_QUOTE": IS_QUOTE,
     "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
@@ -157,15 +157,11 @@ def create_model(lang, lex_attrs, name=None):
     nlp = lang_class()
     for lexeme in nlp.vocab:
         lexeme.rank = OOV_RANK
-    lex_added = 0
     for attrs in lex_attrs:
         if "settings" in attrs:
             continue
         lexeme = nlp.vocab[attrs["orth"]]
         lexeme.set_attrs(**attrs)
-        lexeme.is_oov = False
-        lex_added += 1
-        lex_added += 1
     if len(nlp.vocab):
         oov_prob = min(lex.prob for lex in nlp.vocab) - 1
     else:
@@ -193,8 +189,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
         if vector_keys is not None:
             for word in vector_keys:
                 if word not in nlp.vocab:
-                    lexeme = nlp.vocab[word]
-                    lexeme.is_oov = False
+                    nlp.vocab[word]
         if vectors_data is not None:
             nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
     if name is None:
@@ -15,7 +15,6 @@ import random

 from .._ml import create_default_optimizer
 from ..util import use_gpu as set_gpu
-from ..attrs import PROB, IS_OOV, CLUSTER, LANG
 from ..gold import GoldCorpus
 from ..compat import path2str
 from .. import util
@@ -630,15 +629,6 @@ def _create_progress_bar(total):

 def _load_vectors(nlp, vectors):
     util.load_model(vectors, vocab=nlp.vocab)
-    for lex in nlp.vocab:
-        values = {}
-        for attr, func in nlp.vocab.lex_attr_getters.items():
-            # These attrs are expected to be set by data. Others should
-            # be set by calling the language functions.
-            if attr not in (CLUSTER, PROB, IS_OOV, LANG):
-                values[lex.vocab.strings[attr]] = func(lex.orth_)
-        lex.set_attrs(**values)
-        lex.is_oov = False


 def _load_pretrained_tok2vec(nlp, loc):
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -12,17 +11,14 @@ from ..tag_map import TAG_MAP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc


 class DanishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "da"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     morph_rules = MORPH_RULES
     infixes = TOKENIZER_INFIXES
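The per-language `add_lookups(...)` wiring removed above is replaced by a central setup in `BaseDefaults.create_vocab()`, per the commit message. A hedged sketch of that pattern follows; the helper name and the empty-table fallback are illustrative assumptions, not code from this commit.

# Illustrative only: assembling a NORM getter from the base norms plus the
# vocab's "lexeme_norm" lookups table (populated via spacy-lookups-data).
from spacy.attrs import NORM
from spacy.lang.norm_exceptions import BASE_NORMS
from spacy.util import add_lookups


def norm_getter_from_lookups(defaults, vocab):
    # Base exceptions still come from lang.norm_exceptions; language-specific
    # exceptions are read from lookups when the table is available.
    table = (
        vocab.lookups.get_table("lexeme_norm")
        if vocab.lookups.has_table("lexeme_norm")
        else {}
    )
    return add_lookups(defaults.lex_attr_getters[NORM], BASE_NORMS, table)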
@ -1,527 +0,0 @@
|
|||
# coding: utf8
|
||||
"""
|
||||
Special-case rules for normalizing tokens to improve the model's predictions.
|
||||
For example 'mysterium' vs 'mysterie' and similar.
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# Sources:
|
||||
# 1: https://dsn.dk/retskrivning/om-retskrivningsordbogen/mere-om-retskrivningsordbogen-2012/endrede-stave-og-ordformer/
|
||||
# 2: http://www.tjerry-korrektur.dk/ord-med-flere-stavemaader/
|
||||
|
||||
_exc = {
|
||||
# Alternative spelling
|
||||
"a-kraft-værk": "a-kraftværk", # 1
|
||||
"ålborg": "aalborg", # 2
|
||||
"århus": "aarhus",
|
||||
"accessoirer": "accessoires", # 1
|
||||
"affektert": "affekteret", # 1
|
||||
"afrikander": "afrikaaner", # 1
|
||||
"aftabuere": "aftabuisere", # 1
|
||||
"aftabuering": "aftabuisering", # 1
|
||||
"akvarium": "akvarie", # 1
|
||||
"alenefader": "alenefar", # 1
|
||||
"alenemoder": "alenemor", # 1
|
||||
"alkoholambulatorium": "alkoholambulatorie", # 1
|
||||
"ambulatorium": "ambulatorie", # 1
|
||||
"ananassene": "ananasserne", # 2
|
||||
"anførelsestegn": "anførselstegn", # 1
|
||||
"anseelig": "anselig", # 2
|
||||
"antioxydant": "antioxidant", # 1
|
||||
"artrig": "artsrig", # 1
|
||||
"auditorium": "auditorie", # 1
|
||||
"avocado": "avokado", # 2
|
||||
"bagerst": "bagest", # 2
|
||||
"bagstræv": "bagstræb", # 1
|
||||
"bagstræver": "bagstræber", # 1
|
||||
"bagstræverisk": "bagstræberisk", # 1
|
||||
"balde": "balle", # 2
|
||||
"barselorlov": "barselsorlov", # 1
|
||||
"barselvikar": "barselsvikar", # 1
|
||||
"baskien": "baskerlandet", # 1
|
||||
"bayrisk": "bayersk", # 1
|
||||
"bedstefader": "bedstefar", # 1
|
||||
"bedstemoder": "bedstemor", # 1
|
||||
"behefte": "behæfte", # 1
|
||||
"beheftelse": "behæftelse", # 1
|
||||
"bidragydende": "bidragsydende", # 1
|
||||
"bidragyder": "bidragsyder", # 1
|
||||
"billiondel": "billiontedel", # 1
|
||||
"blaseret": "blasert", # 1
|
||||
"bleskifte": "bleskift", # 1
|
||||
"blodbroder": "blodsbroder", # 2
|
||||
"blyantspidser": "blyantsspidser", # 2
|
||||
"boligministerium": "boligministerie", # 1
|
||||
"borhul": "borehul", # 1
|
||||
"broder": "bror", # 2
|
||||
"buldog": "bulldog", # 2
|
||||
"bådhus": "bådehus", # 1
|
||||
"børnepleje": "barnepleje", # 1
|
||||
"børneseng": "barneseng", # 1
|
||||
"børnestol": "barnestol", # 1
|
||||
"cairo": "kairo", # 1
|
||||
"cambodia": "cambodja", # 1
|
||||
"cambodianer": "cambodjaner", # 1
|
||||
"cambodiansk": "cambodjansk", # 1
|
||||
"camouflage": "kamuflage", # 2
|
||||
"campylobacter": "kampylobakter", # 1
|
||||
"centeret": "centret", # 2
|
||||
"chefskahyt": "chefkahyt", # 1
|
||||
"chefspost": "chefpost", # 1
|
||||
"chefssekretær": "chefsekretær", # 1
|
||||
"chefsstol": "chefstol", # 1
|
||||
"cirkulærskrivelse": "cirkulæreskrivelse", # 1
|
||||
"cognacsglas": "cognacglas", # 1
|
||||
"columnist": "kolumnist", # 1
|
||||
"cricket": "kricket", # 2
|
||||
"dagplejemoder": "dagplejemor", # 1
|
||||
"damaskesdug": "damaskdug", # 1
|
||||
"damp-barn": "dampbarn", # 1
|
||||
"delfinarium": "delfinarie", # 1
|
||||
"dentallaboratorium": "dentallaboratorie", # 1
|
||||
"diaramme": "diasramme", # 1
|
||||
"diaré": "diarré", # 1
|
||||
"dioxyd": "dioxid", # 1
|
||||
"dommedagsprædiken": "dommedagspræken", # 1
|
||||
"donut": "doughnut", # 2
|
||||
"driftmæssig": "driftsmæssig", # 1
|
||||
"driftsikker": "driftssikker", # 1
|
||||
"driftsikring": "driftssikring", # 1
|
||||
"drikkejogurt": "drikkeyoghurt", # 1
|
||||
"drivein": "drive-in", # 1
|
||||
"driveinbiograf": "drive-in-biograf", # 1
|
||||
"drøvel": "drøbel", # 1
|
||||
"dødskriterium": "dødskriterie", # 1
|
||||
"e-mail-adresse": "e-mailadresse", # 1
|
||||
"e-post-adresse": "e-postadresse", # 1
|
||||
"egypten": "ægypten", # 2
|
||||
"ekskommunicere": "ekskommunikere", # 1
|
||||
"eksperimentarium": "eksperimentarie", # 1
|
||||
"elsass": "Alsace", # 1
|
||||
"elsasser": "alsacer", # 1
|
||||
"elsassisk": "alsacisk", # 1
|
||||
"elvetal": "ellevetal", # 1
|
||||
"elvetiden": "ellevetiden", # 1
|
||||
"elveårig": "elleveårig", # 1
|
||||
"elveårs": "elleveårs", # 1
|
||||
"elveårsbarn": "elleveårsbarn", # 1
|
||||
"elvte": "ellevte", # 1
|
||||
"elvtedel": "ellevtedel", # 1
|
||||
"energiministerium": "energiministerie", # 1
|
||||
"erhvervsministerium": "erhvervsministerie", # 1
|
||||
"espaliere": "spaliere", # 2
|
||||
"evangelium": "evangelie", # 1
|
||||
"fagministerium": "fagministerie", # 1
|
||||
"fakse": "faxe", # 1
|
||||
"fangstkvota": "fangstkvote", # 1
|
||||
"fader": "far", # 2
|
||||
"farbroder": "farbror", # 1
|
||||
"farfader": "farfar", # 1
|
||||
"farmoder": "farmor", # 1
|
||||
"federal": "føderal", # 1
|
||||
"federalisering": "føderalisering", # 1
|
||||
"federalisme": "føderalisme", # 1
|
||||
"federalist": "føderalist", # 1
|
||||
"federalistisk": "føderalistisk", # 1
|
||||
"federation": "føderation", # 1
|
||||
"federativ": "føderativ", # 1
|
||||
"fejlbeheftet": "fejlbehæftet", # 1
|
||||
"femetagers": "femetages", # 2
|
||||
"femhundredekroneseddel": "femhundredkroneseddel", # 2
|
||||
"filmpremiere": "filmpræmiere", # 2
|
||||
"finansimperium": "finansimperie", # 1
|
||||
"finansministerium": "finansministerie", # 1
|
||||
"firehjulstræk": "firhjulstræk", # 2
|
||||
"fjernstudium": "fjernstudie", # 1
|
||||
"formalier": "formalia", # 1
|
||||
"formandsskift": "formandsskifte", # 1
|
||||
"fornemst": "fornemmest", # 2
|
||||
"fornuftparti": "fornuftsparti", # 1
|
||||
"fornuftstridig": "fornuftsstridig", # 1
|
||||
"fornuftvæsen": "fornuftsvæsen", # 1
|
||||
"fornuftægteskab": "fornuftsægteskab", # 1
|
||||
"forretningsministerium": "forretningsministerie", # 1
|
||||
"forskningsministerium": "forskningsministerie", # 1
|
||||
"forstudium": "forstudie", # 1
|
||||
"forsvarsministerium": "forsvarsministerie", # 1
|
||||
"frilægge": "fritlægge", # 1
|
||||
"frilæggelse": "fritlæggelse", # 1
|
||||
"frilægning": "fritlægning", # 1
|
||||
"fristille": "fritstille", # 1
|
||||
"fristilling": "fritstilling", # 1
|
||||
"fuldttegnet": "fuldtegnet", # 1
|
||||
"fødestedskriterium": "fødestedskriterie", # 1
|
||||
"fødevareministerium": "fødevareministerie", # 1
|
||||
"følesløs": "følelsesløs", # 1
|
||||
"følgeligt": "følgelig", # 1
|
||||
"førne": "førn", # 1
|
||||
"gearskift": "gearskifte", # 2
|
||||
"gladeligt": "gladelig", # 1
|
||||
"glosehefte": "glosehæfte", # 1
|
||||
"glædeløs": "glædesløs", # 1
|
||||
"gonoré": "gonorré", # 1
|
||||
"grangiveligt": "grangivelig", # 1
|
||||
"grundliggende": "grundlæggende", # 2
|
||||
"grønsag": "grøntsag", # 2
|
||||
"gudbenådet": "gudsbenådet", # 1
|
||||
"gudfader": "gudfar", # 1
|
||||
"gudmoder": "gudmor", # 1
|
||||
"gulvmop": "gulvmoppe", # 1
|
||||
"gymnasium": "gymnasie", # 1
|
||||
"hackning": "hacking", # 1
|
||||
"halvbroder": "halvbror", # 1
|
||||
"halvelvetiden": "halvellevetiden", # 1
|
||||
"handelsgymnasium": "handelsgymnasie", # 1
|
||||
"hefte": "hæfte", # 1
|
||||
"hefteklamme": "hæfteklamme", # 1
|
||||
"heftelse": "hæftelse", # 1
|
||||
"heftemaskine": "hæftemaskine", # 1
|
||||
"heftepistol": "hæftepistol", # 1
|
||||
"hefteplaster": "hæfteplaster", # 1
|
||||
"heftestraf": "hæftestraf", # 1
|
||||
"heftning": "hæftning", # 1
|
||||
"helbroder": "helbror", # 1
|
||||
"hjemmeklasse": "hjemklasse", # 1
|
||||
"hjulspin": "hjulspind", # 1
|
||||
"huggevåben": "hugvåben", # 1
|
||||
"hulmurisolering": "hulmursisolering", # 1
|
||||
"hurtiggående": "hurtigtgående", # 2
|
||||
"hurtigttørrende": "hurtigtørrende", # 2
|
||||
"husmoder": "husmor", # 1
|
||||
"hydroxyd": "hydroxid", # 1
|
||||
"håndmikser": "håndmixer", # 1
|
||||
"højtaler": "højttaler", # 2
|
||||
"hønemoder": "hønemor", # 1
|
||||
"ide": "idé", # 2
|
||||
"imperium": "imperie", # 1
|
||||
"imponerthed": "imponerethed", # 1
|
||||
"inbox": "indboks", # 2
|
||||
"indenrigsministerium": "indenrigsministerie", # 1
|
||||
"indhefte": "indhæfte", # 1
|
||||
"indheftning": "indhæftning", # 1
|
||||
"indicium": "indicie", # 1
|
||||
"indkassere": "inkassere", # 2
|
||||
"iota": "jota", # 1
|
||||
"jobskift": "jobskifte", # 1
|
||||
"jogurt": "yoghurt", # 1
|
||||
"jukeboks": "jukebox", # 1
|
||||
"justitsministerium": "justitsministerie", # 1
|
||||
"kalorifere": "kalorifer", # 1
|
||||
"kandidatstipendium": "kandidatstipendie", # 1
|
||||
"kannevas": "kanvas", # 1
|
||||
"kaperssauce": "kaperssovs", # 1
|
||||
"kigge": "kikke", # 2
|
||||
"kirkeministerium": "kirkeministerie", # 1
|
||||
"klapmydse": "klapmyds", # 1
|
||||
"klimakterium": "klimakterie", # 1
|
||||
"klogeligt": "klogelig", # 1
|
||||
"knivblad": "knivsblad", # 1
|
||||
"kollegaer": "kolleger", # 2
|
||||
"kollegium": "kollegie", # 1
|
||||
"kollegiehefte": "kollegiehæfte", # 1
|
||||
"kollokviumx": "kollokvium", # 1
|
||||
"kommissorium": "kommissorie", # 1
|
||||
"kompendium": "kompendie", # 1
|
||||
"komplicerthed": "komplicerethed", # 1
|
||||
"konfederation": "konføderation", # 1
|
||||
"konfedereret": "konfødereret", # 1
|
||||
"konferensstudium": "konferensstudie", # 1
|
||||
"konservatorium": "konservatorie", # 1
|
||||
"konsulere": "konsultere", # 1
|
||||
"kradsbørstig": "krasbørstig", # 2
|
||||
"kravsspecifikation": "kravspecifikation", # 1
|
||||
"krematorium": "krematorie", # 1
|
||||
"krep": "crepe", # 1
|
||||
"krepnylon": "crepenylon", # 1
|
||||
"kreppapir": "crepepapir", # 1
|
||||
"kricket": "cricket", # 2
|
||||
"kriterium": "kriterie", # 1
|
||||
"kroat": "kroater", # 2
|
||||
"kroki": "croquis", # 1
|
||||
"kronprinsepar": "kronprinspar", # 2
|
||||
"kropdoven": "kropsdoven", # 1
|
||||
"kroplus": "kropslus", # 1
|
||||
"krøllefedt": "krølfedt", # 1
|
||||
"kulturministerium": "kulturministerie", # 1
|
||||
"kuponhefte": "kuponhæfte", # 1
|
||||
"kvota": "kvote", # 1
|
||||
"kvotaordning": "kvoteordning", # 1
|
||||
"laboratorium": "laboratorie", # 1
|
||||
"laksfarve": "laksefarve", # 1
|
||||
"laksfarvet": "laksefarvet", # 1
|
||||
"laksrød": "lakserød", # 1
|
||||
"laksyngel": "lakseyngel", # 1
|
||||
"laksørred": "lakseørred", # 1
|
||||
"landbrugsministerium": "landbrugsministerie", # 1
|
||||
"landskampstemning": "landskampsstemning", # 1
|
||||
"langust": "languster", # 1
|
||||
"lappegrejer": "lappegrej", # 1
|
||||
"lavløn": "lavtløn", # 1
|
||||
"lillebroder": "lillebror", # 1
|
||||
"linear": "lineær", # 1
|
||||
"loftlampe": "loftslampe", # 2
|
||||
"log-in": "login", # 1
|
||||
"login": "log-in", # 2
|
||||
"lovmedholdig": "lovmedholdelig", # 1
|
||||
"ludder": "luder", # 2
|
||||
"lysholder": "lyseholder", # 1
|
||||
"lægeskifte": "lægeskift", # 1
|
||||
"lærvillig": "lærevillig", # 1
|
||||
"løgsauce": "løgsovs", # 1
|
||||
"madmoder": "madmor", # 1
|
||||
"majonæse": "mayonnaise", # 1
|
||||
"mareridtagtig": "mareridtsagtig", # 1
|
||||
"margen": "margin", # 2
|
||||
"martyrium": "martyrie", # 1
|
||||
"mellemstatlig": "mellemstatslig", # 1
|
||||
"menneskene": "menneskerne", # 2
|
||||
"metropolis": "metropol", # 1
|
||||
"miks": "mix", # 1
|
||||
"mikse": "mixe", # 1
|
||||
"miksepult": "mixerpult", # 1
|
||||
"mikser": "mixer", # 1
|
||||
"mikserpult": "mixerpult", # 1
|
||||
"mikslån": "mixlån", # 1
|
||||
"miksning": "mixning", # 1
|
||||
"miljøministerium": "miljøministerie", # 1
|
||||
"milliarddel": "milliardtedel", # 1
|
||||
"milliondel": "milliontedel", # 1
|
||||
"ministerium": "ministerie", # 1
|
||||
"mop": "moppe", # 1
|
||||
"moder": "mor", # 2
|
||||
"moratorium": "moratorie", # 1
|
||||
"morbroder": "morbror", # 1
|
||||
"morfader": "morfar", # 1
|
||||
"mormoder": "mormor", # 1
|
||||
"musikkonservatorium": "musikkonservatorie", # 1
|
||||
"muslingskal": "muslingeskal", # 1
|
||||
"mysterium": "mysterie", # 1
|
||||
"naturalieydelse": "naturalydelse", # 1
|
||||
"naturalieøkonomi": "naturaløkonomi", # 1
|
||||
"navnebroder": "navnebror", # 1
|
||||
"nerium": "nerie", # 1
|
||||
"nådeløs": "nådesløs", # 1
|
||||
"nærforestående": "nærtforestående", # 1
|
||||
"nærstående": "nærtstående", # 1
|
||||
"observatorium": "observatorie", # 1
|
||||
"oldefader": "oldefar", # 1
|
||||
"oldemoder": "oldemor", # 1
|
||||
"opgraduere": "opgradere", # 1
|
||||
"opgraduering": "opgradering", # 1
|
||||
"oratorium": "oratorie", # 1
|
||||
"overbookning": "overbooking", # 1
|
||||
"overpræsidium": "overpræsidie", # 1
|
||||
"overstatlig": "overstatslig", # 1
|
||||
"oxyd": "oxid", # 1
|
||||
"oxydere": "oxidere", # 1
|
||||
"oxydering": "oxidering", # 1
|
||||
"pakkenellike": "pakkenelliker", # 1
|
||||
"papirtynd": "papirstynd", # 1
|
||||
"pastoralseminarium": "pastoralseminarie", # 1
|
||||
"peanutsene": "peanuttene", # 2
|
||||
"penalhus": "pennalhus", # 2
|
||||
"pensakrav": "pensumkrav", # 1
|
||||
"pepperoni": "peperoni", # 1
|
||||
"peruaner": "peruvianer", # 1
|
||||
"petrole": "petrol", # 1
|
||||
"piltast": "piletast", # 1
|
||||
"piltaste": "piletast", # 1
|
||||
"planetarium": "planetarie", # 1
|
||||
"plasteret": "plastret", # 2
|
||||
"plastic": "plastik", # 2
|
||||
"play-off-kamp": "playoffkamp", # 1
|
||||
"plejefader": "plejefar", # 1
|
||||
"plejemoder": "plejemor", # 1
|
||||
"podium": "podie", # 2
|
||||
"praha": "prag", # 2
|
||||
"preciøs": "pretiøs", # 2
|
||||
"privilegium": "privilegie", # 1
|
||||
"progredere": "progrediere", # 1
|
||||
"præsidium": "præsidie", # 1
|
||||
"psykodelisk": "psykedelisk", # 1
|
||||
"pudsegrejer": "pudsegrej", # 1
|
||||
"referensgruppe": "referencegruppe", # 1
|
||||
"referensramme": "referenceramme", # 1
|
||||
"refugium": "refugie", # 1
|
||||
"registeret": "registret", # 2
|
||||
"remedium": "remedie", # 1
|
||||
"remiks": "remix", # 1
|
||||
"reservert": "reserveret", # 1
|
||||
"ressortministerium": "ressortministerie", # 1
|
||||
"ressource": "resurse", # 2
|
||||
"resætte": "resette", # 1
|
||||
"rettelig": "retteligt", # 1
|
||||
"rettetaste": "rettetast", # 1
|
||||
"returtaste": "returtast", # 1
|
||||
"risici": "risikoer", # 2
|
||||
"roll-on": "rollon", # 1
|
||||
"rollehefte": "rollehæfte", # 1
|
||||
"rostbøf": "roastbeef", # 1
|
||||
"rygsæksturist": "rygsækturist", # 1
|
||||
"rødstjært": "rødstjert", # 1
|
||||
"saddel": "sadel", # 2
|
||||
"samaritan": "samaritaner", # 2
|
||||
"sanatorium": "sanatorie", # 1
|
||||
"sauce": "sovs", # 1
|
||||
"scanning": "skanning", # 2
|
||||
"sceneskifte": "sceneskift", # 1
|
||||
"scilla": "skilla", # 1
|
||||
"sejflydende": "sejtflydende", # 1
|
||||
"selvstudium": "selvstudie", # 1
|
||||
"seminarium": "seminarie", # 1
|
||||
"sennepssauce": "sennepssovs ", # 1
|
||||
"servitutbeheftet": "servitutbehæftet", # 1
|
||||
"sit-in": "sitin", # 1
|
||||
"skatteministerium": "skatteministerie", # 1
|
||||
"skifer": "skiffer", # 2
|
||||
"skyldsfølelse": "skyldfølelse", # 1
|
||||
"skysauce": "skysovs", # 1
|
||||
"sladdertaske": "sladretaske", # 2
|
||||
"sladdervorn": "sladrevorn", # 2
|
||||
"slagsbroder": "slagsbror", # 1
|
||||
"slettetaste": "slettetast", # 1
|
||||
"smørsauce": "smørsovs", # 1
|
||||
"snitsel": "schnitzel", # 1
|
||||
"snobbeeffekt": "snobeffekt", # 2
|
||||
"socialministerium": "socialministerie", # 1
|
||||
"solarium": "solarie", # 1
|
||||
"soldebroder": "soldebror", # 1
|
||||
"spagetti": "spaghetti", # 1
|
||||
"spagettistrop": "spaghettistrop", # 1
|
||||
"spagettiwestern": "spaghettiwestern", # 1
|
||||
"spin-off": "spinoff", # 1
|
||||
"spinnefiskeri": "spindefiskeri", # 1
|
||||
"spolorm": "spoleorm", # 1
|
||||
"sproglaboratorium": "sproglaboratorie", # 1
|
||||
"spækbræt": "spækkebræt", # 2
|
||||
"stand-in": "standin", # 1
|
||||
"stand-up-comedy": "standupcomedy", # 1
|
||||
"stand-up-komiker": "standupkomiker", # 1
|
||||
"statsministerium": "statsministerie", # 1
|
||||
"stedbroder": "stedbror", # 1
|
||||
"stedfader": "stedfar", # 1
|
||||
"stedmoder": "stedmor", # 1
|
||||
"stilehefte": "stilehæfte", # 1
|
||||
"stipendium": "stipendie", # 1
|
||||
"stjært": "stjert", # 1
|
||||
"stjærthage": "stjerthage", # 1
|
||||
"storebroder": "storebror", # 1
|
||||
"stortå": "storetå", # 1
|
||||
"strabads": "strabadser", # 1
|
||||
"strømlinjet": "strømlinet", # 1
|
||||
"studium": "studie", # 1
|
||||
"stænkelap": "stænklap", # 1
|
||||
"sundhedsministerium": "sundhedsministerie", # 1
|
||||
"suppositorium": "suppositorie", # 1
|
||||
"svejts": "schweiz", # 1
|
||||
"svejtser": "schweizer", # 1
|
||||
"svejtserfranc": "schweizerfranc", # 1
|
||||
"svejtserost": "schweizerost", # 1
|
||||
"svejtsisk": "schweizisk", # 1
|
||||
"svigerfader": "svigerfar", # 1
|
||||
"svigermoder": "svigermor", # 1
|
||||
"svirebroder": "svirebror", # 1
|
||||
"symposium": "symposie", # 1
|
||||
"sælarium": "sælarie", # 1
|
||||
"søreme": "sørme", # 2
|
||||
"søterritorium": "søterritorie", # 1
|
||||
"t-bone-steak": "t-bonesteak", # 1
|
||||
"tabgivende": "tabsgivende", # 1
|
||||
"tabuere": "tabuisere", # 1
|
||||
"tabuering": "tabuisering", # 1
|
||||
"tackle": "takle", # 2
|
||||
"tackling": "takling", # 2
|
||||
"taifun": "tyfon", # 1
|
||||
"take-off": "takeoff", # 1
|
||||
"taknemlig": "taknemmelig", # 2
|
||||
"talehørelærer": "tale-høre-lærer", # 1
|
||||
"talehøreundervisning": "tale-høre-undervisning", # 1
|
||||
"tandstik": "tandstikker", # 1
|
||||
"tao": "dao", # 1
|
||||
"taoisme": "daoisme", # 1
|
||||
"taoist": "daoist", # 1
|
||||
"taoistisk": "daoistisk", # 1
|
||||
"taverne": "taverna", # 1
|
||||
"teateret": "teatret", # 2
|
||||
"tekno": "techno", # 1
|
||||
"temposkifte": "temposkift", # 1
|
||||
"terrarium": "terrarie", # 1
|
||||
"territorium": "territorie", # 1
|
||||
"tesis": "tese", # 1
|
||||
"tidsstudium": "tidsstudie", # 1
|
||||
"tipoldefader": "tipoldefar", # 1
|
||||
"tipoldemoder": "tipoldemor", # 1
|
||||
"tomatsauce": "tomatsovs", # 1
|
||||
"tonart": "toneart", # 1
|
||||
"trafikministerium": "trafikministerie", # 1
|
||||
"tredve": "tredive", # 1
|
||||
"tredver": "trediver", # 1
|
||||
"tredveårig": "trediveårig", # 1
|
||||
"tredveårs": "trediveårs", # 1
|
||||
"tredveårsfødselsdag": "trediveårsfødselsdag", # 1
|
||||
"tredvte": "tredivte", # 1
|
||||
"tredvtedel": "tredivtedel", # 1
|
||||
"troldunge": "troldeunge", # 1
|
||||
"trommestikke": "trommestik", # 1
|
||||
"trubadur": "troubadour", # 2
|
||||
"trøstepræmie": "trøstpræmie", # 2
|
||||
"tummerum": "trummerum", # 1
|
||||
"tumultuarisk": "tumultarisk", # 1
|
||||
"tunghørighed": "tunghørhed", # 1
|
||||
"tus": "tusch", # 2
|
||||
"tusind": "tusinde", # 2
|
||||
"tvillingbroder": "tvillingebror", # 1
|
||||
"tvillingbror": "tvillingebror", # 1
|
||||
"tvillingebroder": "tvillingebror", # 1
|
||||
"ubeheftet": "ubehæftet", # 1
|
||||
"udenrigsministerium": "udenrigsministerie", # 1
|
||||
"udhulning": "udhuling", # 1
|
||||
"udslaggivende": "udslagsgivende", # 1
|
||||
"udspekulert": "udspekuleret", # 1
|
||||
"udviklingsministerium": "udviklingsministerie", # 1
|
||||
"uforpligtigende": "uforpligtende", # 1
|
||||
"uheldvarslende": "uheldsvarslende", # 1
|
||||
"uimponerthed": "uimponerethed", # 1
|
||||
"undervisningsministerium": "undervisningsministerie", # 1
|
||||
"unægtelig": "unægteligt", # 1
|
||||
"urinale": "urinal", # 1
|
||||
"uvederheftig": "uvederhæftig", # 1
|
||||
"vabel": "vable", # 2
|
||||
"vadi": "wadi", # 1
|
||||
"vaklevorn": "vakkelvorn", # 1
|
||||
"vanadin": "vanadium", # 1
|
||||
"vaselin": "vaseline", # 1
|
||||
"vederheftig": "vederhæftig", # 1
|
||||
"vedhefte": "vedhæfte", # 1
|
||||
"velar": "velær", # 1
|
||||
"videndeling": "vidensdeling", # 2
|
||||
"vinkelanførelsestegn": "vinkelanførselstegn", # 1
|
||||
"vipstjært": "vipstjert", # 1
|
||||
"vismut": "bismut", # 1
|
||||
"visvas": "vissevasse", # 1
|
||||
"voksværk": "vokseværk", # 1
|
||||
"værtdyr": "værtsdyr", # 1
|
||||
"værtplante": "værtsplante", # 1
|
||||
"wienersnitsel": "wienerschnitzel", # 1
|
||||
"yderliggående": "yderligtgående", # 2
|
||||
"zombi": "zombie", # 1
|
||||
"ægbakke": "æggebakke", # 1
|
||||
"ægformet": "æggeformet", # 1
|
||||
"ægleder": "æggeleder", # 1
|
||||
"ækvilibrist": "ekvilibrist", # 2
|
||||
"æselsøre": "æseløre", # 1
|
||||
"øjehule": "øjenhule", # 1
|
||||
"øjelåg": "øjenlåg", # 1
|
||||
"øjeåbner": "øjenåbner", # 1
|
||||
"økonomiministerium": "økonomiministerie", # 1
|
||||
"ørenring": "ørering", # 2
|
||||
"øvehefte": "øvehæfte", # 1
|
||||
}
|
||||
|
||||
|
||||
NORM_EXCEPTIONS = {}
|
||||
|
||||
for string, norm in _exc.items():
|
||||
NORM_EXCEPTIONS[string] = norm
|
||||
NORM_EXCEPTIONS[string.title()] = norm
|
|
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .punctuation import TOKENIZER_INFIXES
 from .tag_map import TAG_MAP
@@ -10,18 +9,14 @@ from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc


 class GermanDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: "de"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
@@ -1,16 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-# Here we only want to include the absolute most common words. Otherwise,
-# this list would get impossibly long for German – especially considering the
-# old vs. new spelling rules, and all possible cases.
-
-
-_exc = {"daß": "dass"}
-
-
-NORM_EXCEPTIONS = {}
-
-for string, norm in _exc.items():
-    NORM_EXCEPTIONS[string] = norm
-    NORM_EXCEPTIONS[string.title()] = norm
@@ -10,21 +10,16 @@ from .lemmatizer import GreekLemmatizer
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...lookups import Lookups
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc


 class GreekDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "el"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
File diff suppressed because it is too large
Load Diff
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -10,10 +9,9 @@ from .morph_rules import MORPH_RULES
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc


 def _return_en(_):
@@ -24,9 +22,6 @@ class EnglishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = _return_en
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
File diff suppressed because it is too large
Load Diff
@@ -4,25 +4,20 @@ from __future__ import unicode_literals
 from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .tag_map import TAG_MAP

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc


 class IndonesianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: "id"
     lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
@ -1,532 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
# Daftar kosakata yang sering salah dieja
|
||||
# https://id.wikipedia.org/wiki/Wikipedia:Daftar_kosakata_bahasa_Indonesia_yang_sering_salah_dieja
|
||||
_exc = {
|
||||
# Slang and abbreviations
|
||||
"silahkan": "silakan",
|
||||
"yg": "yang",
|
||||
"kalo": "kalau",
|
||||
"cawu": "caturwulan",
|
||||
"ok": "oke",
|
||||
"gak": "tidak",
|
||||
"enggak": "tidak",
|
||||
"nggak": "tidak",
|
||||
"ndak": "tidak",
|
||||
"ngga": "tidak",
|
||||
"dgn": "dengan",
|
||||
"tdk": "tidak",
|
||||
"jg": "juga",
|
||||
"klo": "kalau",
|
||||
"denger": "dengar",
|
||||
"pinter": "pintar",
|
||||
"krn": "karena",
|
||||
"nemuin": "menemukan",
|
||||
"jgn": "jangan",
|
||||
"udah": "sudah",
|
||||
"sy": "saya",
|
||||
"udh": "sudah",
|
||||
"dapetin": "mendapatkan",
|
||||
"ngelakuin": "melakukan",
|
||||
"ngebuat": "membuat",
|
||||
"membikin": "membuat",
|
||||
"bikin": "buat",
|
||||
# Daftar kosakata yang sering salah dieja
|
||||
"malpraktik": "malapraktik",
|
||||
"malfungsi": "malafungsi",
|
||||
"malserap": "malaserap",
|
||||
"maladaptasi": "malaadaptasi",
|
||||
"malsuai": "malasuai",
|
||||
"maldistribusi": "maladistribusi",
|
||||
"malgizi": "malagizi",
|
||||
"malsikap": "malasikap",
|
||||
"memperhatikan": "memerhatikan",
|
||||
"akte": "akta",
|
||||
"cemilan": "camilan",
|
||||
"esei": "esai",
|
||||
"frase": "frasa",
|
||||
"kafeteria": "kafetaria",
|
||||
"ketapel": "katapel",
|
||||
"kenderaan": "kendaraan",
|
||||
"menejemen": "manajemen",
|
||||
"menejer": "manajer",
|
||||
"mesjid": "masjid",
|
||||
"rebo": "rabu",
|
||||
"seksama": "saksama",
|
||||
"senggama": "sanggama",
|
||||
"sekedar": "sekadar",
|
||||
"seprei": "seprai",
|
||||
"semedi": "semadi",
|
||||
"samadi": "semadi",
|
||||
"amandemen": "amendemen",
|
||||
"algoritma": "algoritme",
|
||||
"aritmatika": "aritmetika",
|
||||
"metoda": "metode",
|
||||
"materai": "meterai",
|
||||
"meterei": "meterai",
|
||||
"kalendar": "kalender",
|
||||
"kadaluwarsa": "kedaluwarsa",
|
||||
"katagori": "kategori",
|
||||
"parlamen": "parlemen",
|
||||
"sekular": "sekuler",
|
||||
"selular": "seluler",
|
||||
"sirkular": "sirkuler",
|
||||
"survai": "survei",
|
||||
"survey": "survei",
|
||||
"aktuil": "aktual",
|
||||
"formil": "formal",
|
||||
"trotoir": "trotoar",
|
||||
"komersiil": "komersial",
|
||||
"komersil": "komersial",
|
||||
"tradisionil": "tradisionial",
|
||||
"orisinil": "orisinal",
|
||||
"orijinil": "orisinal",
|
||||
"afdol": "afdal",
|
||||
"antri": "antre",
|
||||
"apotik": "apotek",
|
||||
"atlit": "atlet",
|
||||
"atmosfir": "atmosfer",
|
||||
"cidera": "cedera",
|
||||
"cendikiawan": "cendekiawan",
|
||||
"cepet": "cepat",
|
||||
"cinderamata": "cenderamata",
|
||||
"debet": "debit",
|
||||
"difinisi": "definisi",
|
||||
"dekrit": "dekret",
|
||||
"disain": "desain",
|
||||
"diskripsi": "deskripsi",
|
||||
"diskotik": "diskotek",
|
||||
"eksim": "eksem",
|
||||
"exim": "eksem",
|
||||
"faidah": "faedah",
|
||||
"ekstrim": "ekstrem",
|
||||
"ekstrimis": "ekstremis",
|
||||
"komplit": "komplet",
|
||||
"konkrit": "konkret",
|
||||
"kongkrit": "konkret",
|
||||
"kongkret": "konkret",
|
||||
"kridit": "kredit",
|
||||
"musium": "museum",
|
||||
"pinalti": "penalti",
|
||||
"piranti": "peranti",
|
||||
"pinsil": "pensil",
|
||||
"personil": "personel",
|
||||
"sistim": "sistem",
|
||||
"teoritis": "teoretis",
|
||||
"vidio": "video",
|
||||
"cengkeh": "cengkih",
|
||||
"desertasi": "disertasi",
|
||||
"hakekat": "hakikat",
|
||||
"intelejen": "intelijen",
|
||||
"kaedah": "kaidah",
|
||||
"kempes": "kempis",
|
||||
"kementrian": "kementerian",
|
||||
"ledeng": "leding",
|
||||
"nasehat": "nasihat",
|
||||
"penasehat": "penasihat",
|
||||
"praktek": "praktik",
|
||||
"praktekum": "praktikum",
|
||||
"resiko": "risiko",
|
||||
"retsleting": "ritsleting",
|
||||
"senen": "senin",
|
||||
"amuba": "ameba",
|
||||
"punggawa": "penggawa",
|
||||
"surban": "serban",
|
||||
"nomer": "nomor",
|
||||
"sorban": "serban",
|
||||
"bis": "bus",
|
||||
"agribisnis": "agrobisnis",
|
||||
"kantung": "kantong",
|
||||
"khutbah": "khotbah",
|
||||
"mandur": "mandor",
|
||||
"rubuh": "roboh",
|
||||
"pastur": "pastor",
|
||||
"supir": "sopir",
|
||||
"goncang": "guncang",
|
||||
"goa": "gua",
|
||||
"kaos": "kaus",
|
||||
"kokoh": "kukuh",
|
||||
"komulatif": "kumulatif",
|
||||
"kolomnis": "kolumnis",
|
||||
"korma": "kurma",
|
||||
"lobang": "lubang",
|
||||
"limo": "limusin",
|
||||
"limosin": "limusin",
|
||||
"mangkok": "mangkuk",
|
||||
"saos": "saus",
|
||||
"sop": "sup",
|
||||
"sorga": "surga",
|
||||
"tegor": "tegur",
|
||||
"telor": "telur",
|
||||
"obrak-abrik": "ubrak-abrik",
|
||||
"ekwivalen": "ekuivalen",
|
||||
"frekwensi": "frekuensi",
|
||||
"konsekwensi": "konsekuensi",
|
||||
"kwadran": "kuadran",
|
||||
"kwadrat": "kuadrat",
|
||||
"kwalifikasi": "kualifikasi",
|
||||
"kwalitas": "kualitas",
|
||||
"kwalitet": "kualitas",
|
||||
"kwalitatif": "kualitatif",
|
||||
"kwantitas": "kuantitas",
|
||||
"kwantitatif": "kuantitatif",
|
||||
"kwantum": "kuantum",
|
||||
"kwartal": "kuartal",
|
||||
"kwintal": "kuintal",
|
||||
"kwitansi": "kuitansi",
|
||||
"kwatir": "khawatir",
|
||||
"kuatir": "khawatir",
|
||||
"jadual": "jadwal",
|
||||
"hirarki": "hierarki",
|
||||
"karir": "karier",
|
||||
"aktip": "aktif",
|
||||
"daptar": "daftar",
|
||||
"efektip": "efektif",
|
||||
"epektif": "efektif",
|
||||
"epektip": "efektif",
|
||||
"Pebruari": "Februari",
|
||||
"pisik": "fisik",
|
||||
"pondasi": "fondasi",
|
||||
"photo": "foto",
|
||||
"photokopi": "fotokopi",
|
||||
"hapal": "hafal",
|
||||
"insap": "insaf",
|
||||
"insyaf": "insaf",
|
||||
"konperensi": "konferensi",
|
||||
"kreatip": "kreatif",
|
||||
"kreativ": "kreatif",
|
||||
"maap": "maaf",
|
||||
"napsu": "nafsu",
|
||||
"negatip": "negatif",
|
||||
"negativ": "negatif",
|
||||
"objektip": "objektif",
|
||||
"obyektip": "objektif",
|
||||
"obyektif": "objektif",
|
||||
"pasip": "pasif",
|
||||
"pasiv": "pasif",
|
||||
"positip": "positif",
|
||||
"positiv": "positif",
|
||||
"produktip": "produktif",
|
||||
"produktiv": "produktif",
|
||||
"sarap": "saraf",
|
||||
"sertipikat": "sertifikat",
|
||||
"subjektip": "subjektif",
|
||||
"subyektip": "subjektif",
|
||||
"subyektif": "subjektif",
|
||||
"tarip": "tarif",
|
||||
"transitip": "transitif",
|
||||
"transitiv": "transitif",
|
||||
"faham": "paham",
|
||||
"fikir": "pikir",
|
||||
"berfikir": "berpikir",
|
||||
"telefon": "telepon",
|
||||
"telfon": "telepon",
|
||||
"telpon": "telepon",
|
||||
"tilpon": "telepon",
|
||||
"nafas": "napas",
|
||||
"bernafas": "bernapas",
|
||||
"pernafasan": "pernapasan",
|
||||
"vermak": "permak",
|
||||
"vulpen": "pulpen",
|
||||
"aktifis": "aktivis",
|
||||
"konfeksi": "konveksi",
|
||||
"motifasi": "motivasi",
|
||||
"Nopember": "November",
|
||||
"propinsi": "provinsi",
|
||||
"babtis": "baptis",
|
||||
"jerembab": "jerembap",
|
||||
"lembab": "lembap",
|
||||
"sembab": "sembap",
|
||||
"saptu": "sabtu",
|
||||
"tekat": "tekad",
|
||||
"bejad": "bejat",
|
||||
"nekad": "nekat",
|
||||
"otoped": "otopet",
|
||||
"skuad": "skuat",
|
||||
"jenius": "genius",
|
||||
"marjin": "margin",
|
||||
"marjinal": "marginal",
|
||||
"obyek": "objek",
|
||||
"subyek": "subjek",
|
||||
"projek": "proyek",
|
||||
"azas": "asas",
|
||||
"ijasah": "ijazah",
|
||||
"jenasah": "jenazah",
|
||||
"plasa": "plaza",
|
||||
"bathin": "batin",
|
||||
"Katholik": "Katolik",
|
||||
"orthografi": "ortografi",
|
||||
"pathogen": "patogen",
|
||||
"theologi": "teologi",
|
||||
"ijin": "izin",
|
||||
"rejeki": "rezeki",
|
||||
"rejim": "rezim",
|
||||
"jaman": "zaman",
|
||||
"jamrud": "zamrud",
|
||||
"jinah": "zina",
|
||||
"perjinahan": "perzinaan",
|
||||
"anugrah": "anugerah",
|
||||
"cendrawasih": "cenderawasih",
|
||||
"jendral": "jenderal",
|
||||
"kripik": "keripik",
|
||||
"krupuk": "kerupuk",
|
||||
"ksatria": "kesatria",
|
||||
"mentri": "menteri",
|
||||
"negri": "negeri",
|
||||
"Prancis": "Perancis",
|
||||
"sebrang": "seberang",
|
||||
"menyebrang": "menyeberang",
|
||||
"Sumatra": "Sumatera",
|
||||
"trampil": "terampil",
|
||||
"isteri": "istri",
|
||||
"justeru": "justru",
|
||||
"perajurit": "prajurit",
|
||||
"putera": "putra",
|
||||
"puteri": "putri",
|
||||
"samudera": "samudra",
|
||||
"sastera": "sastra",
|
||||
"sutera": "sutra",
|
||||
"terompet": "trompet",
|
||||
"iklas": "ikhlas",
|
||||
"iktisar": "ikhtisar",
|
||||
"kafilah": "khafilah",
|
||||
"kawatir": "khawatir",
|
||||
"kotbah": "khotbah",
|
||||
"kusyuk": "khusyuk",
|
||||
"makluk": "makhluk",
|
||||
"mahluk": "makhluk",
|
||||
"mahkluk": "makhluk",
|
||||
"nahkoda": "nakhoda",
|
||||
"nakoda": "nakhoda",
|
||||
"tahta": "takhta",
|
||||
"takhyul": "takhayul",
|
||||
"tahyul": "takhayul",
|
||||
"tahayul": "takhayul",
|
||||
"akhli": "ahli",
|
||||
"anarkhi": "anarki",
|
||||
"kharisma": "karisma",
|
||||
"kharismatik": "karismatik",
|
||||
"mahsud": "maksud",
|
||||
"makhsud": "maksud",
|
||||
"rakhmat": "rahmat",
|
||||
"tekhnik": "teknik",
|
||||
"tehnik": "teknik",
|
||||
"tehnologi": "teknologi",
|
||||
"ikhwal": "ihwal",
|
||||
"expor": "ekspor",
|
||||
"extra": "ekstra",
|
||||
"komplex": "komplek",
|
||||
"sex": "seks",
|
||||
"taxi": "taksi",
|
||||
"extasi": "ekstasi",
|
||||
"syaraf": "saraf",
|
||||
"syurga": "surga",
|
||||
"mashur": "masyhur",
|
||||
"masyur": "masyhur",
|
||||
"mahsyur": "masyhur",
|
||||
"mashyur": "masyhur",
|
||||
"muadzin": "muazin",
|
||||
"adzan": "azan",
|
||||
"ustadz": "ustaz",
|
||||
"ustad": "ustaz",
|
||||
"ustadzah": "ustaz",
|
||||
"dzikir": "zikir",
|
||||
"dzuhur": "zuhur",
|
||||
"dhuhur": "zuhur",
|
||||
"zhuhur": "zuhur",
|
||||
"analisa": "analisis",
|
||||
"diagnosa": "diagnosis",
|
||||
"hipotesa": "hipotesis",
|
||||
"sintesa": "sintesis",
|
||||
"aktiviti": "aktivitas",
|
||||
"aktifitas": "aktivitas",
|
||||
"efektifitas": "efektivitas",
|
||||
"komuniti": "komunitas",
|
||||
"kreatifitas": "kreativitas",
|
||||
"produktifitas": "produktivitas",
|
||||
"realiti": "realitas",
|
||||
"realita": "realitas",
|
||||
"selebriti": "selebritas",
|
||||
"spotifitas": "sportivitas",
|
||||
"universiti": "universitas",
|
||||
"utiliti": "utilitas",
|
||||
"validiti": "validitas",
|
||||
"dilokalisir": "dilokalisasi",
|
||||
"didramatisir": "didramatisasi",
|
||||
"dipolitisir": "dipolitisasi",
|
||||
"dinetralisir": "dinetralisasi",
|
||||
"dikonfrontir": "dikonfrontasi",
|
||||
"mendominir": "mendominasi",
|
||||
"koordinir": "koordinasi",
|
||||
"proklamir": "proklamasi",
|
||||
"terorganisir": "terorganisasi",
|
||||
"terealisir": "terealisasi",
|
||||
"robah": "ubah",
|
||||
"dirubah": "diubah",
|
||||
"merubah": "mengubah",
|
||||
"terlanjur": "telanjur",
|
||||
"terlantar": "telantar",
|
||||
"penglepasan": "pelepasan",
|
||||
"pelihatan": "penglihatan",
|
||||
"pemukiman": "permukiman",
|
||||
"pengrumahan": "perumahan",
|
||||
"penyewaan": "persewaan",
|
||||
"menyintai": "mencintai",
|
||||
"menyolok": "mencolok",
|
||||
"contek": "sontek",
|
||||
"mencontek": "menyontek",
|
||||
"pungkir": "mungkir",
|
||||
"dipungkiri": "dimungkiri",
|
||||
"kupungkiri": "kumungkiri",
|
||||
"kaupungkiri": "kaumungkiri",
|
||||
"nampak": "tampak",
|
||||
"nampaknya": "tampaknya",
|
||||
"nongkrong": "tongkrong",
|
||||
"berternak": "beternak",
|
||||
"berterbangan": "beterbangan",
|
||||
"berserta": "beserta",
|
||||
"berperkara": "beperkara",
|
||||
"berpergian": "bepergian",
|
||||
"berkerja": "bekerja",
|
||||
"berberapa": "beberapa",
|
||||
"terbersit": "tebersit",
|
||||
"terpercaya": "tepercaya",
|
||||
"terperdaya": "teperdaya",
|
||||
"terpercik": "tepercik",
|
||||
"terpergok": "tepergok",
|
||||
"aksesoris": "aksesori",
|
||||
"handal": "andal",
|
||||
"hantar": "antar",
|
||||
"panutan": "anutan",
|
||||
"atsiri": "asiri",
|
||||
"bhakti": "bakti",
|
||||
"china": "cina",
|
||||
"dharma": "darma",
|
||||
"diktaktor": "diktator",
|
||||
"eksport": "ekspor",
|
||||
"hembus": "embus",
|
||||
"hadits": "hadis",
|
||||
"hadist": "hadits",
|
||||
"harafiah": "harfiah",
|
||||
"himbau": "imbau",
|
||||
"import": "impor",
|
||||
"inget": "ingat",
|
||||
"hisap": "isap",
|
||||
"interprestasi": "interpretasi",
|
||||
"kangker": "kanker",
|
||||
"konggres": "kongres",
|
||||
"lansekap": "lanskap",
|
||||
"maghrib": "magrib",
|
||||
"emak": "mak",
|
||||
"moderen": "modern",
|
||||
"pasport": "paspor",
|
||||
"perduli": "peduli",
|
||||
"ramadhan": "ramadan",
|
||||
"rapih": "rapi",
|
||||
"Sansekerta": "Sanskerta",
|
||||
"shalat": "salat",
|
||||
"sholat": "salat",
|
||||
"silahkan": "silakan",
|
||||
"standard": "standar",
|
||||
"hutang": "utang",
|
||||
"zinah": "zina",
|
||||
"ambulan": "ambulans",
|
||||
"antartika": "sntarktika",
|
||||
"arteri": "arteria",
|
||||
"asik": "asyik",
|
||||
"australi": "australia",
|
||||
"denga": "dengan",
|
||||
"depo": "depot",
|
||||
"detil": "detail",
|
||||
"ensiklopedi": "ensiklopedia",
|
||||
"elit": "elite",
|
||||
"frustasi": "frustrasi",
|
||||
"gladi": "geladi",
|
||||
"greget": "gereget",
|
||||
"itali": "italia",
|
||||
"karna": "karena",
|
||||
"klenteng": "kelenteng",
|
||||
"erling": "kerling",
|
||||
"kontruksi": "konstruksi",
|
||||
"masal": "massal",
|
||||
"merk": "merek",
|
||||
"respon": "respons",
|
||||
"diresponi": "direspons",
|
||||
"skak": "sekak",
|
||||
"stir": "setir",
|
||||
"singapur": "singapura",
|
||||
"standarisasi": "standardisasi",
|
||||
"varitas": "varietas",
|
||||
"amphibi": "amfibi",
|
||||
"anjlog": "anjlok",
|
||||
"alpukat": "avokad",
|
||||
"alpokat": "avokad",
|
||||
"bolpen": "pulpen",
|
||||
"cabe": "cabai",
|
||||
"cabay": "cabai",
|
||||
"ceret": "cerek",
|
||||
"differensial": "diferensial",
|
||||
"duren": "durian",
|
||||
"faksimili": "faksimile",
|
||||
"faksimil": "faksimile",
|
||||
"graha": "gerha",
|
||||
"goblog": "goblok",
|
||||
"gombrong": "gombroh",
|
||||
"horden": "gorden",
|
||||
"korden": "gorden",
|
||||
"gubug": "gubuk",
|
||||
"imaginasi": "imajinasi",
|
||||
"jerigen": "jeriken",
|
||||
"jirigen": "jeriken",
|
||||
"carut-marut": "karut-marut",
|
||||
"kwota": "kuota",
|
||||
"mahzab": "mazhab",
|
||||
"mempesona": "memesona",
|
||||
"milyar": "miliar",
|
||||
"missi": "misi",
|
||||
"nenas": "nanas",
|
||||
"negoisasi": "negosiasi",
|
||||
"automotif": "otomotif",
|
||||
"pararel": "paralel",
|
||||
"paska": "pasca",
|
||||
"prosen": "persen",
|
||||
"pete": "petai",
|
||||
"petay": "petai",
|
||||
"proffesor": "profesor",
|
||||
"rame": "ramai",
|
||||
"rapot": "rapor",
|
||||
"rileks": "relaks",
|
||||
"rileksasi": "relaksasi",
|
||||
"renumerasi": "remunerasi",
|
||||
"seketaris": "sekretaris",
|
||||
"sekertaris": "sekretaris",
|
||||
"sensorik": "sensoris",
|
||||
"sentausa": "sentosa",
|
||||
"strawberi": "stroberi",
|
||||
"strawbery": "stroberi",
|
||||
"taqwa": "takwa",
|
||||
"tauco": "taoco",
|
||||
"tauge": "taoge",
|
||||
"toge": "taoge",
|
||||
"tauladan": "teladan",
|
||||
"taubat": "tobat",
|
||||
"trilyun": "triliun",
|
||||
"vissi": "visi",
|
||||
"coklat": "cokelat",
|
||||
"narkotika": "narkotik",
|
||||
"oase": "oasis",
|
||||
"politisi": "politikus",
|
||||
"terong": "terung",
|
||||
"wool": "wol",
|
||||
"himpit": "impit",
|
||||
"mujizat": "mukjizat",
|
||||
"mujijat": "mukjizat",
|
||||
"yag": "yang",
|
||||
}
|
||||
|
||||
NORM_EXCEPTIONS = {}
|
||||
|
||||
for string, norm in _exc.items():
|
||||
NORM_EXCEPTIONS[string] = norm
|
||||
NORM_EXCEPTIONS[string.title()] = norm
|
|
@@ -2,26 +2,21 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc


 class LuxembourgishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "lb"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
@@ -1,16 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-# TODO
-# norm execptions: find a possibility to deal with the zillions of spelling
-# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
-# here one could include the most common spelling mistakes
-
-_exc = {"dass": "datt", "viläicht": "vläicht"}
-
-
-NORM_EXCEPTIONS = {}
-
-for string, norm in _exc.items():
-    NORM_EXCEPTIONS[string] = norm
-    NORM_EXCEPTIONS[string.title()] = norm
@@ -186,10 +186,6 @@ def suffix(string):
     return string[-3:]


-def cluster(string):
-    return 0
-
-
 def is_alpha(string):
     return string.isalpha()

@@ -218,20 +214,11 @@ def is_stop(string, stops=set()):
     return string.lower() in stops


-def is_oov(string):
-    return True
-
-
-def get_prob(string):
-    return -20.0
-
-
 LEX_ATTRS = {
     attrs.LOWER: lower,
     attrs.NORM: lower,
     attrs.PREFIX: prefix,
     attrs.SUFFIX: suffix,
-    attrs.CLUSTER: cluster,
     attrs.IS_ALPHA: is_alpha,
     attrs.IS_DIGIT: is_digit,
     attrs.IS_LOWER: is_lower,
@@ -239,8 +226,6 @@ LEX_ATTRS = {
     attrs.IS_TITLE: is_title,
     attrs.IS_UPPER: is_upper,
     attrs.IS_STOP: is_stop,
-    attrs.IS_OOV: is_oov,
-    attrs.PROB: get_prob,
     attrs.LIKE_EMAIL: like_email,
     attrs.LIKE_NUM: like_num,
     attrs.IS_PUNCT: is_punct,
@@ -5,22 +5,17 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
-from .norm_exceptions import NORM_EXCEPTIONS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc


 class PortugueseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: "pt"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
@@ -1,23 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-# These exceptions are used to add NORM values based on a token's ORTH value.
-# Individual languages can also add their own exceptions and overwrite them -
-# for example, British vs. American spelling in English.
-
-# Norms are only set if no alternative is provided in the tokenizer exceptions.
-# Note that this does not change any other token attributes. Its main purpose
-# is to normalise the word representations so that equivalent tokens receive
-# similar representations. For example: $ and € are very different, but they're
-# both currency symbols. By normalising currency symbols to $, all symbols are
-# seen as similar, no matter how common they are in the training data.
-
-
-NORM_EXCEPTIONS = {
-    "R$": "$",  # Real
-    "r$": "$",  # Real
-    "Cz$": "$",  # Cruzado
-    "cz$": "$",  # Cruzado
-    "NCz$": "$",  # Cruzado Novo
-    "ncz$": "$",  # Cruzado Novo
-}
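For context, the effect these exceptions have once they are served from lookups rather than from this deleted module (a sketch assuming a v2.3-era pipeline; without `spacy-lookups-data` installed the norm simply falls back to the lowercased form):

import spacy

nlp = spacy.blank("pt")
lex = nlp.vocab["R$"]
# "$" when the Portuguese lexeme_norm table supplies the exception,
# otherwise the default lowercased form "r$".
print(lex.norm_)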
@@ -3,26 +3,21 @@ from __future__ import unicode_literals, print_function

 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
 from .lemmatizer import RussianLemmatizer

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
-from ...util import update_exc, add_lookups
+from ...util import update_exc
 from ...language import Language
 from ...lookups import Lookups
-from ...attrs import LANG, NORM
+from ...attrs import LANG


 class RussianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "ru"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
@@ -1,36 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-_exc = {
-    # Slang
-    "прив": "привет",
-    "дарова": "привет",
-    "дак": "так",
-    "дык": "так",
-    "здарова": "привет",
-    "пакедава": "пока",
-    "пакедаво": "пока",
-    "ща": "сейчас",
-    "спс": "спасибо",
-    "пжлст": "пожалуйста",
-    "плиз": "пожалуйста",
-    "ладненько": "ладно",
-    "лады": "ладно",
-    "лан": "ладно",
-    "ясн": "ясно",
-    "всм": "всмысле",
-    "хош": "хочешь",
-    "хаюшки": "привет",
-    "оч": "очень",
-    "че": "что",
-    "чо": "что",
-    "шо": "что",
-}
-
-
-NORM_EXCEPTIONS = {}
-
-for string, norm in _exc.items():
-    NORM_EXCEPTIONS[string] = norm
-    NORM_EXCEPTIONS[string.title()] = norm
@@ -3,22 +3,17 @@ from __future__ import unicode_literals

 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc


 class SerbianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "sr"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS

@@ -1,26 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-_exc = {
-    # Slang
-    "ћале": "отац",
-    "кева": "мајка",
-    "смор": "досада",
-    "кец": "јединица",
-    "тебра": "брат",
-    "штребер": "ученик",
-    "факс": "факултет",
-    "профа": "професор",
-    "бус": "аутобус",
-    "пискарало": "службеник",
-    "бакутанер": "бака",
-    "џибер": "простак",
-}
-
-
-NORM_EXCEPTIONS = {}
-
-for string, norm in _exc.items():
-    NORM_EXCEPTIONS[string] = norm
-    NORM_EXCEPTIONS[string.title()] = norm
@ -1,139 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
_exc = {
|
||||
# Regional words normal
|
||||
# Sri Lanka - wikipeadia
|
||||
"இங்க": "இங்கே",
|
||||
"வாங்க": "வாருங்கள்",
|
||||
"ஒண்டு": "ஒன்று",
|
||||
"கண்டு": "கன்று",
|
||||
"கொண்டு": "கொன்று",
|
||||
"பண்டி": "பன்றி",
|
||||
"பச்ச": "பச்சை",
|
||||
"அம்பது": "ஐம்பது",
|
||||
"வெச்ச": "வைத்து",
|
||||
"வச்ச": "வைத்து",
|
||||
"வச்சி": "வைத்து",
|
||||
"வாளைப்பழம்": "வாழைப்பழம்",
|
||||
"மண்ணு": "மண்",
|
||||
"பொன்னு": "பொன்",
|
||||
"சாவல்": "சேவல்",
|
||||
"அங்கால": "அங்கு ",
|
||||
"அசுப்பு": "நடமாட்டம்",
|
||||
"எழுவான் கரை": "எழுவான்கரை",
|
||||
"ஓய்யாரம்": "எழில் ",
|
||||
"ஒளும்பு": "எழும்பு",
|
||||
"ஓர்மை": "துணிவு",
|
||||
"கச்சை": "கோவணம்",
|
||||
"கடப்பு": "தெருவாசல்",
|
||||
"சுள்ளி": "காய்ந்த குச்சி",
|
||||
"திறாவுதல்": "தடவுதல்",
|
||||
"நாசமறுப்பு": "தொல்லை",
|
||||
"பரிசாரி": "வைத்தியன்",
|
||||
"பறவாதி": "பேராசைக்காரன்",
|
||||
"பிசினி": "உலோபி ",
|
||||
"விசர்": "பைத்தியம்",
|
||||
"ஏனம்": "பாத்திரம்",
|
||||
"ஏலா": "இயலாது",
|
||||
"ஒசில்": "அழகு",
|
||||
"ஒள்ளுப்பம்": "கொஞ்சம்",
|
||||
# Srilankan and indian
|
||||
"குத்துமதிப்பு": "",
|
||||
"நூனாயம்": "நூல்நயம்",
|
||||
"பைய": "மெதுவாக",
|
||||
"மண்டை": "தலை",
|
||||
"வெள்ளனே": "சீக்கிரம்",
|
||||
"உசுப்பு": "எழுப்பு",
|
||||
"ஆணம்": "குழம்பு",
|
||||
"உறக்கம்": "தூக்கம்",
|
||||
"பஸ்": "பேருந்து",
|
||||
"களவு": "திருட்டு ",
|
||||
# relationship
|
||||
"புருசன்": "கணவன்",
|
||||
"பொஞ்சாதி": "மனைவி",
|
||||
"புள்ள": "பிள்ளை",
|
||||
"பிள்ள": "பிள்ளை",
|
||||
"ஆம்பிளப்புள்ள": "ஆண் பிள்ளை",
|
||||
"பொம்பிளப்புள்ள": "பெண் பிள்ளை",
|
||||
"அண்ணாச்சி": "அண்ணா",
|
||||
"அக்காச்சி": "அக்கா",
|
||||
"தங்கச்சி": "தங்கை",
|
||||
# difference words
|
||||
"பொடியன்": "சிறுவன்",
|
||||
"பொட்டை": "சிறுமி",
|
||||
"பிறகு": "பின்பு",
|
||||
"டக்கென்டு": "விரைவாக",
|
||||
"கெதியா": "விரைவாக",
|
||||
"கிறுகி": "திரும்பி",
|
||||
"போயித்து வாறன்": "போய் வருகிறேன்",
|
||||
"வருவாங்களா": "வருவார்களா",
|
||||
# regular spokens
|
||||
"சொல்லு": "சொல்",
|
||||
"கேளு": "கேள்",
|
||||
"சொல்லுங்க": "சொல்லுங்கள்",
|
||||
"கேளுங்க": "கேளுங்கள்",
|
||||
"நீங்கள்": "நீ",
|
||||
"உன்": "உன்னுடைய",
|
||||
# Portugeese formal words
|
||||
"அலவாங்கு": "கடப்பாரை",
|
||||
"ஆசுப்பத்திரி": "மருத்துவமனை",
|
||||
"உரோதை": "சில்லு",
|
||||
"கடுதாசி": "கடிதம்",
|
||||
"கதிரை": "நாற்காலி",
|
||||
"குசினி": "அடுக்களை",
|
||||
"கோப்பை": "கிண்ணம்",
|
||||
"சப்பாத்து": "காலணி",
|
||||
"தாச்சி": "இரும்புச் சட்டி",
|
||||
"துவாய்": "துவாலை",
|
||||
"தவறணை": "மதுக்கடை",
|
||||
"பீப்பா": "மரத்தாழி",
|
||||
"யன்னல்": "சாளரம்",
|
||||
"வாங்கு": "மரஇருக்கை",
|
||||
# Dutch formal words
|
||||
"இறாக்கை": "பற்சட்டம்",
|
||||
"இலாட்சி": "இழுப்பறை",
|
||||
"கந்தோர்": "பணிமனை",
|
||||
"நொத்தாரிசு": "ஆவண எழுத்துபதிவாளர்",
|
||||
# English formal words
|
||||
"இஞ்சினியர்": "பொறியியலாளர்",
|
||||
"சூப்பு": "ரசம்",
|
||||
"செக்": "காசோலை",
|
||||
"சேட்டு": "மேற்ச்சட்டை",
|
||||
"மார்க்கட்டு": "சந்தை",
|
||||
"விண்ணன்": "கெட்டிக்காரன்",
|
||||
# Arabic formal words
|
||||
"ஈமான்": "நம்பிக்கை",
|
||||
"சுன்னத்து": "விருத்தசேதனம்",
|
||||
"செய்த்தான்": "பிசாசு",
|
||||
"மவுத்து": "இறப்பு",
|
||||
"ஹலால்": "அங்கீகரிக்கப்பட்டது",
|
||||
"கறாம்": "நிராகரிக்கப்பட்டது",
|
||||
# Persian, Hindustanian and hindi formal words
|
||||
"சுமார்": "கிட்டத்தட்ட",
|
||||
"சிப்பாய்": "போர்வீரன்",
|
||||
"சிபார்சு": "சிபாரிசு",
|
||||
"ஜமீன்": "பணக்காரா்",
|
||||
"அசல்": "மெய்யான",
|
||||
"அந்தஸ்து": "கௌரவம்",
|
||||
"ஆஜர்": "சமா்ப்பித்தல்",
|
||||
"உசார்": "எச்சரிக்கை",
|
||||
"அச்சா": "நல்ல",
|
||||
# English words used in text conversations
|
||||
"bcoz": "ஏனெனில்",
|
||||
"bcuz": "ஏனெனில்",
|
||||
"fav": "விருப்பமான",
|
||||
"morning": "காலை வணக்கம்",
|
||||
"gdeveng": "மாலை வணக்கம்",
|
||||
"gdnyt": "இரவு வணக்கம்",
|
||||
"gdnit": "இரவு வணக்கம்",
|
||||
"plz": "தயவு செய்து",
|
||||
"pls": "தயவு செய்து",
|
||||
"thx": "நன்றி",
|
||||
"thanx": "நன்றி",
|
||||
}
|
||||
|
||||
NORM_EXCEPTIONS = {}
|
||||
|
||||
for string, norm in _exc.items():
|
||||
NORM_EXCEPTIONS[string] = norm
|
|
@ -4,14 +4,12 @@ from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .norm_exceptions import NORM_EXCEPTIONS
from .lex_attrs import LEX_ATTRS

from ..norm_exceptions import BASE_NORMS
from ...attrs import LANG, NORM
from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
from ...util import DummyTokenizer, add_lookups
from ...util import DummyTokenizer


class ThaiTokenizer(DummyTokenizer):

@ -37,9 +35,6 @@ class ThaiDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda _text: "th"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
    )
    tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS

@ -1,113 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
_exc = {
|
||||
# Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์)
|
||||
"สนุ๊กเกอร์": "สนุกเกอร์",
|
||||
"โน้ต": "โน้ต",
|
||||
# Misspelled because of being lazy or hustle (สะกดผิดเพราะขี้เกียจพิมพ์ หรือเร่งรีบ)
|
||||
"โทสับ": "โทรศัพท์",
|
||||
"พุ่งนี้": "พรุ่งนี้",
|
||||
# Strange (ให้ดูแปลกตา)
|
||||
"ชะมะ": "ใช่ไหม",
|
||||
"ชิมิ": "ใช่ไหม",
|
||||
"ชะ": "ใช่ไหม",
|
||||
"ช่ายมะ": "ใช่ไหม",
|
||||
"ป่าว": "เปล่า",
|
||||
"ป่ะ": "เปล่า",
|
||||
"ปล่าว": "เปล่า",
|
||||
"คัย": "ใคร",
|
||||
"ไค": "ใคร",
|
||||
"คราย": "ใคร",
|
||||
"เตง": "ตัวเอง",
|
||||
"ตะเอง": "ตัวเอง",
|
||||
"รึ": "หรือ",
|
||||
"เหรอ": "หรือ",
|
||||
"หรา": "หรือ",
|
||||
"หรอ": "หรือ",
|
||||
"ชั้น": "ฉัน",
|
||||
"ชั้ล": "ฉัน",
|
||||
"ช้าน": "ฉัน",
|
||||
"เทอ": "เธอ",
|
||||
"เทอร์": "เธอ",
|
||||
"เทอว์": "เธอ",
|
||||
"แกร": "แก",
|
||||
"ป๋ม": "ผม",
|
||||
"บ่องตง": "บอกตรงๆ",
|
||||
"ถ่ามตง": "ถามตรงๆ",
|
||||
"ต่อมตง": "ตอบตรงๆ",
|
||||
"เพิ่ล": "เพื่อน",
|
||||
"จอบอ": "จอบอ",
|
||||
"ดั้ย": "ได้",
|
||||
"ขอบคุง": "ขอบคุณ",
|
||||
"ยังงัย": "ยังไง",
|
||||
"Inw": "เทพ",
|
||||
"uou": "นอน",
|
||||
"Lกรีeu": "เกรียน",
|
||||
# Misspelled to express emotions (คำที่สะกดผิดเพื่อแสดงอารมณ์)
|
||||
"เปงราย": "เป็นอะไร",
|
||||
"เปนรัย": "เป็นอะไร",
|
||||
"เปงรัย": "เป็นอะไร",
|
||||
"เป็นอัลไล": "เป็นอะไร",
|
||||
"ทามมาย": "ทำไม",
|
||||
"ทามมัย": "ทำไม",
|
||||
"จังรุย": "จังเลย",
|
||||
"จังเยย": "จังเลย",
|
||||
"จุงเบย": "จังเลย",
|
||||
"ไม่รู้": "มะรุ",
|
||||
"เฮ่ย": "เฮ้ย",
|
||||
"เห้ย": "เฮ้ย",
|
||||
"น่าร็อค": "น่ารัก",
|
||||
"น่าร๊าก": "น่ารัก",
|
||||
"ตั้ลล๊าก": "น่ารัก",
|
||||
"คือร๊ะ": "คืออะไร",
|
||||
"โอป่ะ": "โอเคหรือเปล่า",
|
||||
"น่ามคาน": "น่ารำคาญ",
|
||||
"น่ามสาร": "น่าสงสาร",
|
||||
"วงวาร": "สงสาร",
|
||||
"บับว่า": "แบบว่า",
|
||||
"อัลไล": "อะไร",
|
||||
"อิจ": "อิจฉา",
|
||||
# Reduce rough words or Avoid to software filter (คำที่สะกดผิดเพื่อลดความหยาบของคำ หรืออาจใช้หลีกเลี่ยงการกรองคำหยาบของซอฟต์แวร์)
|
||||
"กรู": "กู",
|
||||
"กุ": "กู",
|
||||
"กรุ": "กู",
|
||||
"ตู": "กู",
|
||||
"ตรู": "กู",
|
||||
"มรึง": "มึง",
|
||||
"เมิง": "มึง",
|
||||
"มืง": "มึง",
|
||||
"มุง": "มึง",
|
||||
"สาด": "สัตว์",
|
||||
"สัส": "สัตว์",
|
||||
"สัก": "สัตว์",
|
||||
"แสรด": "สัตว์",
|
||||
"โคโตะ": "โคตร",
|
||||
"โคด": "โคตร",
|
||||
"โครต": "โคตร",
|
||||
"โคตะระ": "โคตร",
|
||||
"พ่อง": "พ่อมึง",
|
||||
"แม่เมิง": "แม่มึง",
|
||||
"เชี่ย": "เหี้ย",
|
||||
# Imitate words (คำเลียนเสียง โดยส่วนใหญ่จะเพิ่มทัณฑฆาต หรือซ้ำตัวอักษร)
|
||||
"แอร๊ยย": "อ๊าย",
|
||||
"อร๊ายยย": "อ๊าย",
|
||||
"มันส์": "มัน",
|
||||
"วู๊วววววววว์": "วู้",
|
||||
# Acronym (แบบคำย่อ)
|
||||
"หมาลัย": "มหาวิทยาลัย",
|
||||
"วิดวะ": "วิศวะ",
|
||||
"สินสาด ": "ศิลปศาสตร์",
|
||||
"สินกำ ": "ศิลปกรรมศาสตร์",
|
||||
"เสารีย์ ": "อนุเสาวรีย์ชัยสมรภูมิ",
|
||||
"เมกา ": "อเมริกา",
|
||||
"มอไซค์ ": "มอเตอร์ไซค์",
|
||||
}
|
||||
|
||||
|
||||
NORM_EXCEPTIONS = {}
|
||||
|
||||
for string, norm in _exc.items():
|
||||
NORM_EXCEPTIONS[string] = norm
|
||||
NORM_EXCEPTIONS[string.title()] = norm
|
|
@ -28,10 +28,11 @@ from .compat import izip, basestring_, is_python2, class_types
from .gold import GoldParse
from .scorer import Scorer
from ._ml import link_vectors_to_models, create_default_optimizer
from .attrs import IS_STOP, LANG
from .attrs import IS_STOP, LANG, NORM
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.norm_exceptions import BASE_NORMS
from .lang.tag_map import TAG_MAP
from .tokens import Doc
from .lang.lex_attrs import LEX_ATTRS, is_stop

@ -77,6 +78,9 @@ class BaseDefaults(object):
            lemmatizer=lemmatizer,
            lookups=lookups,
        )
        vocab.lex_attr_getters[NORM] = util.add_lookups(
            vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), BASE_NORMS, vocab.lookups.get_table("lexeme_norm")
        )
        for tag_str, exc in cls.morph_rules.items():
            for orth_str, attrs in exc.items():
                vocab.morphology.add_special_case(tag_str, orth_str, attrs)

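The practical effect of this hunk: the per-language NORM_EXCEPTIONS modules are gone, and the NORM getter is composed when the vocab is created, from the base norms plus whatever "lexeme_norm" table the lookups provide (shipped separately in spacy-lookups-data). A rough pure-Python sketch of that composition, not the commit's actual util.add_lookups implementation, with toy data standing in for the real tables:

# Sketch only: illustrates the fallback chain built by the add_lookups call above.
BASE_NORMS = {"'cause": "because"}   # assumed base exception, shared across languages
lexeme_norm = {"daß": "dass"}        # assumed language-specific table from lookups

def make_norm_getter(default, *tables):
    # Tables are consulted in the order they are passed; the default
    # getter is the final fallback.
    def norm(string):
        for table in tables:
            if string in table:
                return table[string]
        return default(string)
    return norm

norm = make_norm_getter(lambda s: s.lower(), BASE_NORMS, lexeme_norm)
print(norm("daß"))      # "dass"
print(norm("'cause"))   # "because"
print(norm("Dogs"))     # "dogs"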
@ -1,8 +1,8 @@
from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
from .attrs cimport attr_id_t
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG

from .structs cimport LexemeC, SerializedLexemeC
from .structs cimport LexemeC
from .strings cimport StringStore
from .vocab cimport Vocab


@ -24,22 +24,6 @@ cdef class Lexeme:
        self.vocab = vocab
        self.orth = lex.orth

    @staticmethod
    cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil:
        cdef SerializedLexemeC lex_data
        buff = <const unsigned char*>&lex.flags
        end = <const unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
        for i in range(sizeof(lex_data.data)):
            lex_data.data[i] = buff[i]
        return lex_data

    @staticmethod
    cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil:
        buff = <unsigned char*>&lex.flags
        end = <unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
        for i in range(sizeof(lex_data.data)):
            buff[i] = lex_data.data[i]

    @staticmethod
    cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
        if name < (sizeof(flags_t) * 8):

@ -56,8 +40,6 @@ cdef class Lexeme:
            lex.prefix = value
        elif name == SUFFIX:
            lex.suffix = value
        elif name == CLUSTER:
            lex.cluster = value
        elif name == LANG:
            lex.lang = value

@ -84,8 +66,6 @@ cdef class Lexeme:
            return lex.suffix
        elif feat_name == LENGTH:
            return lex.length
        elif feat_name == CLUSTER:
            return lex.cluster
        elif feat_name == LANG:
            return lex.lang
        else:

@ -17,7 +17,7 @@ from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
from .attrs cimport IS_CURRENCY, IS_OOV, PROB
from .attrs cimport IS_CURRENCY

from .attrs import intify_attrs
from .errors import Errors, Warnings

@ -89,12 +89,11 @@ cdef class Lexeme:
        cdef attr_id_t attr
        attrs = intify_attrs(attrs)
        for attr, value in attrs.items():
            if attr == PROB:
                self.c.prob = value
            elif attr == CLUSTER:
                self.c.cluster = int(value)
            elif isinstance(value, int) or isinstance(value, long):
                Lexeme.set_struct_attr(self.c, attr, value)
            # skip PROB, e.g. from lexemes.jsonl
            if isinstance(value, float):
                continue
            elif isinstance(value, (int, long)):
                Lexeme.set_struct_attr(self.c, attr, value)
            else:
                Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))

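Lexeme.set_attrs no longer special-cases PROB or CLUSTER; any float value (for example a "prob" field coming from a lexemes.jsonl) is simply skipped, so it is never pushed through the StringStore. A rough pure-Python sketch of that filtering rule, using a made-up attrs dict for illustration:

# Hypothetical illustration of the new rule: floats are dropped, ints are
# stored directly, everything else would be interned as a string first.
attrs = {"orth": "dog", "prob": -8.5, "norm": "dog", "length": 3}

kept = {}
for name, value in attrs.items():
    if isinstance(value, float):
        continue              # e.g. "prob" from lexemes.jsonl is ignored here
    elif isinstance(value, int):
        kept[name] = value    # numeric struct attributes
    else:
        kept[name] = value    # strings would go through the StringStore

print(kept)  # {'orth': 'dog', 'norm': 'dog', 'length': 3}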
@ -137,34 +136,6 @@ cdef class Lexeme:
        xp = get_array_module(vector)
        return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))

    def to_bytes(self):
        lex_data = Lexeme.c_to_bytes(self.c)
        start = <const char*>&self.c.flags
        end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment)
        if (end-start) != sizeof(lex_data.data):
            raise ValueError(Errors.E072.format(length=end-start,
                                                bad_length=sizeof(lex_data.data)))
        byte_string = b"\0" * sizeof(lex_data.data)
        byte_chars = <char*>byte_string
        for i in range(sizeof(lex_data.data)):
            byte_chars[i] = lex_data.data[i]
        if len(byte_string) != sizeof(lex_data.data):
            raise ValueError(Errors.E072.format(length=len(byte_string),
                                                bad_length=sizeof(lex_data.data)))
        return byte_string

    def from_bytes(self, bytes byte_string):
        # This method doesn't really have a use-case --- wrote it for testing.
        # Possibly delete? It puts the Lexeme out of synch with the vocab.
        cdef SerializedLexemeC lex_data
        if len(byte_string) != sizeof(lex_data.data):
            raise ValueError(Errors.E072.format(length=len(byte_string),
                                                bad_length=sizeof(lex_data.data)))
        for i in range(len(byte_string)):
            lex_data.data[i] = byte_string[i]
        Lexeme.c_from_bytes(self.c, lex_data)
        self.orth = self.c.orth

    @property
    def has_vector(self):
        """RETURNS (bool): Whether a word vector is associated with the object.

@ -208,10 +179,14 @@ cdef class Lexeme:
        """RETURNS (float): A scalar value indicating the positivity or
        negativity of the lexeme."""
        def __get__(self):
            return self.c.sentiment
            sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
            return sentiment_table.get(self.c.orth, 0.0)

        def __set__(self, float sentiment):
            self.c.sentiment = sentiment
        def __set__(self, float x):
            if "lexeme_sentiment" not in self.vocab.lookups:
                self.vocab.lookups.add_table("lexeme_sentiment")
            sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
            sentiment_table[self.c.orth] = x

    @property
    def orth_(self):

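Sentiment is no longer stored on the LexemeC struct; it lives in a "lexeme_sentiment" lookups table keyed by the lexeme's orth. A small usage sketch (not part of the commit), assuming a blank English pipeline built from this branch:

import spacy

nlp = spacy.blank("en")
lex = nlp.vocab["awesome"]

lex.sentiment = 0.9    # creates the "lexeme_sentiment" table on first write
print(lex.sentiment)                             # 0.9, read back from the lookups table
print("lexeme_sentiment" in nlp.vocab.lookups)   # True
print(nlp.vocab["meh"].sentiment)                # 0.0 default for unknown entries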
@ -238,9 +213,13 @@ cdef class Lexeme:
        lexeme text.
        """
        def __get__(self):
            return self.c.norm
            return self.c.norm

        def __set__(self, attr_t x):
            if "lexeme_norm" not in self.vocab.lookups:
                self.vocab.lookups.add_table("lexeme_norm")
            norm_table = self.vocab.lookups.get_table("lexeme_norm")
            norm_table[self.c.orth] = self.vocab.strings[x]
            self.c.norm = x

    property shape:

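NORM is the one attribute that now lives in two places: setting it writes the string into the vocab's "lexeme_norm" table (so it survives serialization via lookups) and also updates the norm field on the struct. A quick sketch, not part of the commit:

import spacy

nlp = spacy.blank("en")
lex = nlp.vocab["daß"]

lex.norm_ = "dass"    # goes through the setter shown above
print(lex.norm_)                                   # "dass"
norm_table = nlp.vocab.lookups.get_table("lexeme_norm")
print(norm_table[lex.orth])                        # "dass", mirrored in the lookups table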
@ -276,10 +255,12 @@ cdef class Lexeme:
    property cluster:
        """RETURNS (int): Brown cluster ID."""
        def __get__(self):
            return self.c.cluster
            cluster_table = self.vocab.load_extra_lookups("lexeme_cluster")
            return cluster_table.get(self.c.orth, 0)

        def __set__(self, attr_t x):
            self.c.cluster = x
        def __set__(self, int x):
            cluster_table = self.vocab.load_extra_lookups("lexeme_cluster")
            cluster_table[self.c.orth] = x

    property lang:
        """RETURNS (uint64): Language of the parent vocabulary."""

@ -293,10 +274,14 @@ cdef class Lexeme:
        """RETURNS (float): Smoothed log probability estimate of the lexeme's
        type."""
        def __get__(self):
            return self.c.prob
            prob_table = self.vocab.load_extra_lookups("lexeme_prob")
            settings_table = self.vocab.load_extra_lookups("lexeme_settings")
            default_oov_prob = settings_table.get("oov_prob", -20.0)
            return prob_table.get(self.c.orth, default_oov_prob)

        def __set__(self, float x):
            self.c.prob = x
            prob_table = self.vocab.load_extra_lookups("lexeme_prob")
            prob_table[self.c.orth] = x

    property lower_:
        """RETURNS (unicode): Lowercase form of the word."""

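cluster and prob are no longer stored on the lexeme at all: both go through Vocab.load_extra_lookups(), which lazily pulls the "lexeme_cluster" / "lexeme_prob" / "lexeme_settings" tables from the lookups registry if a matching entry is registered, and otherwise falls back to an empty table. A hedged sketch of what that means in practice, not part of the commit:

import spacy

nlp = spacy.blank("en")
lex = nlp.vocab["the"]

# Without spacy-lookups-data installed, the extra tables start out empty,
# so the defaults apply: cluster 0 and the OOV probability (-20.0 unless a
# "lexeme_settings" table overrides "oov_prob").
print(lex.cluster)   # 0
print(lex.prob)      # -20.0

# Writing stores the value in the lazily created extra table.
lex.prob = -8.5
print(nlp.vocab.lookups_extra.get_table("lexeme_prob")[lex.orth])  # -8.5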
@ -314,7 +299,7 @@ cdef class Lexeme:
            return self.vocab.strings[self.c.norm]

        def __set__(self, unicode x):
            self.c.norm = self.vocab.strings.add(x)
            self.norm = self.vocab.strings.add(x)

    property shape_:
        """RETURNS (unicode): Transform of the word's string, to show

@ -362,13 +347,10 @@ cdef class Lexeme:
        def __set__(self, flags_t x):
            self.c.flags = x

    property is_oov:
    @property
    def is_oov(self):
        """RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_OOV)

        def __set__(self, attr_t x):
            Lexeme.c_set_flag(self.c, IS_OOV, x)
        return self.orth in self.vocab.vectors

    property is_stop:
        """RETURNS (bool): Whether the lexeme is a stop word."""

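is_oov stops being a settable lexeme flag and becomes read-only: in this revision it simply mirrors whether the lexeme's orth is present in the vocab's vectors table, which is exactly what the test_vector_is_oov test added further down asserts. A small sketch along the lines of that test (not part of the commit); note the polarity matches the new test, where a lexeme with a vector reports is_oov as True:

import numpy
from spacy.vocab import Vocab

vocab = Vocab(vectors_name="demo_vectors")
data = numpy.zeros((2, 3), dtype="f")
vocab.set_vector("cat", data[0])

# After this commit, Lexeme.is_oov just checks membership in vocab.vectors.
print(vocab["cat"].is_oov)      # True  (has a vector entry)
print(vocab["hamster"].is_oov)  # False (no vector entry)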
@ -124,7 +124,7 @@ class Lookups(object):
        self._tables[key].update(value)
        return self

    def to_disk(self, path, **kwargs):
    def to_disk(self, path, filename="lookups.bin", **kwargs):
        """Save the lookups to a directory as lookups.bin. Expects a path to a
        directory, which will be created if it doesn't exist.

@ -136,11 +136,11 @@ class Lookups(object):
        path = ensure_path(path)
        if not path.exists():
            path.mkdir()
        filepath = path / "lookups.bin"
        filepath = path / filename
        with filepath.open("wb") as file_:
            file_.write(self.to_bytes())

    def from_disk(self, path, **kwargs):
    def from_disk(self, path, filename="lookups.bin", **kwargs):
        """Load lookups from a directory containing a lookups.bin. Will skip
        loading if the file doesn't exist.

@ -150,7 +150,7 @@ class Lookups(object):
        DOCS: https://spacy.io/api/lookups#from_disk
        """
        path = ensure_path(path)
        filepath = path / "lookups.bin"
        filepath = path / filename
        if filepath.exists():
            with filepath.open("rb") as file_:
                data = file_.read()

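The new filename parameter exists so the vocab can keep two lookups containers side by side in one model directory: the regular tables in lookups.bin and the optional extra tables in lookups_extra.bin. A small round-trip sketch (not part of the commit) using a temporary directory:

import tempfile
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lexeme_cluster", {"dog": 5})

with tempfile.TemporaryDirectory() as tmpdir:
    # Written as lookups_extra.bin instead of the default lookups.bin.
    lookups.to_disk(tmpdir, filename="lookups_extra.bin")

    reloaded = Lookups()
    reloaded.from_disk(tmpdir, filename="lookups_extra.bin")
    print(reloaded.get_table("lexeme_cluster")["dog"])  # 5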
@ -23,29 +23,6 @@ cdef struct LexemeC:
    attr_t prefix
    attr_t suffix

    attr_t cluster

    float prob
    float sentiment


cdef struct SerializedLexemeC:
    unsigned char[8 + 8*10 + 4 + 4] data
    # sizeof(flags_t)  # flags
    # + sizeof(attr_t)  # lang
    # + sizeof(attr_t)  # id
    # + sizeof(attr_t)  # length
    # + sizeof(attr_t)  # orth
    # + sizeof(attr_t)  # lower
    # + sizeof(attr_t)  # norm
    # + sizeof(attr_t)  # shape
    # + sizeof(attr_t)  # prefix
    # + sizeof(attr_t)  # suffix
    # + sizeof(attr_t)  # cluster
    # + sizeof(float)  # prob
    # + sizeof(float)  # cluster
    # + sizeof(float)  # l2_norm


cdef struct SpanC:
    hash_t id

@ -12,7 +12,7 @@ cdef enum symbol_t:
    LIKE_NUM
    LIKE_EMAIL
    IS_STOP
    IS_OOV
    IS_OOV_DEPRECATED
    IS_BRACKET
    IS_QUOTE
    IS_LEFT_PUNCT

@ -17,7 +17,7 @@ IDS = {
    "LIKE_NUM": LIKE_NUM,
    "LIKE_EMAIL": LIKE_EMAIL,
    "IS_STOP": IS_STOP,
    "IS_OOV": IS_OOV,
    "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
    "IS_BRACKET": IS_BRACKET,
    "IS_QUOTE": IS_QUOTE,
    "IS_LEFT_PUNCT": IS_LEFT_PUNCT,

@ -37,14 +37,6 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer):
|
|||
assert tokens[7].text == "."
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,norm", [("akvarium", "akvarie"), ("bedstemoder", "bedstemor")]
|
||||
)
|
||||
def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm):
|
||||
tokens = da_tokenizer(text)
|
||||
assert tokens[0].norm_ == norm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,n_tokens",
|
||||
[
|
||||
|
|
|
@ -22,17 +22,3 @@ def test_de_tokenizer_handles_exc_in_text(de_tokenizer):
|
|||
assert len(tokens) == 6
|
||||
assert tokens[2].text == "z.Zt."
|
||||
assert tokens[2].lemma_ == "zur Zeit"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,norms", [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])]
|
||||
)
|
||||
def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
|
||||
tokens = de_tokenizer(text)
|
||||
assert [token.norm_ for token in tokens] == norms
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,norm", [("daß", "dass")])
|
||||
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
|
||||
tokens = de_tokenizer(text)
|
||||
assert tokens[0].norm_ == norm
|
||||
|
|
|
@ -118,6 +118,7 @@ def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
|
|||
assert [token.norm_ for token in tokens] == norms
|
||||
|
||||
|
||||
@pytest.mark.skip
|
||||
@pytest.mark.parametrize(
|
||||
"text,norm", [("radicalised", "radicalized"), ("cuz", "because")]
|
||||
)
|
||||
|
|
|
@ -22,9 +22,3 @@ def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
|
|||
assert len(tokens) == 9
|
||||
assert tokens[1].text == "'t"
|
||||
assert tokens[1].lemma_ == "et"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")])
|
||||
def test_lb_norm_exceptions(lb_tokenizer, text, norm):
|
||||
tokens = lb_tokenizer(text)
|
||||
assert tokens[0].norm_ == norm
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
import pickle
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.strings import StringStore
|
||||
|
||||
|
@ -36,8 +37,8 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
|
|||
assert vocab1.to_bytes() == vocab1_b
|
||||
new_vocab1 = Vocab().from_bytes(vocab1_b)
|
||||
assert new_vocab1.to_bytes() == vocab1_b
|
||||
assert len(new_vocab1) == len(strings1)
|
||||
assert sorted([lex.text for lex in new_vocab1]) == sorted(strings1)
|
||||
assert len(new_vocab1.strings) == len(strings1) + 1 # adds _SP
|
||||
assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + ["_SP"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strings1,strings2", test_strings)
|
||||
|
@ -51,12 +52,13 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
|
|||
vocab2.to_disk(file_path2)
|
||||
vocab1_d = Vocab().from_disk(file_path1)
|
||||
vocab2_d = Vocab().from_disk(file_path2)
|
||||
assert list(vocab1_d) == list(vocab1)
|
||||
assert list(vocab2_d) == list(vocab2)
|
||||
# check strings rather than lexemes, which are only reloaded on demand
|
||||
assert strings1 == [s for s in vocab1_d.strings if s != "_SP"]
|
||||
assert strings2 == [s for s in vocab2_d.strings if s != "_SP"]
|
||||
if strings1 == strings2:
|
||||
assert list(vocab1_d) == list(vocab2_d)
|
||||
assert [s for s in vocab1_d.strings if s != "_SP"] == [s for s in vocab2_d.strings if s != "_SP"]
|
||||
else:
|
||||
assert list(vocab1_d) != list(vocab2_d)
|
||||
assert [s for s in vocab1_d.strings if s != "_SP"] != [s for s in vocab2_d.strings if s != "_SP"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
||||
|
@ -76,7 +78,7 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr):
|
|||
vocab = Vocab(strings=strings)
|
||||
length = len(vocab)
|
||||
vocab.from_bytes(vocab.to_bytes())
|
||||
assert len(vocab) == length
|
||||
assert len(vocab.strings) == len(strings) + 1 # adds _SP
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
||||
|
@ -127,3 +129,11 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
|
|||
assert list(sstore1_d) == list(sstore2_d)
|
||||
else:
|
||||
assert list(sstore1_d) != list(sstore2_d)
|
||||
|
||||
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
||||
def test_pickle_vocab(strings, lex_attr):
|
||||
vocab = Vocab(strings=strings)
|
||||
vocab[strings[0]].norm_ = lex_attr
|
||||
vocab_pickled = pickle.dumps(vocab)
|
||||
vocab_unpickled = pickle.loads(vocab_pickled)
|
||||
assert vocab.to_bytes() == vocab_unpickled.to_bytes()
|
||||
|
|
|
@ -26,7 +26,7 @@ def test_lemmatizer_reflects_lookups_changes():
|
|||
nlp_bytes = nlp.to_bytes()
|
||||
new_nlp.from_bytes(nlp_bytes)
|
||||
# Make sure we have the previously saved lookup table
|
||||
assert len(new_nlp.vocab.lookups) == 1
|
||||
assert "lemma_lookup" in new_nlp.vocab.lookups
|
||||
assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2
|
||||
assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world"
|
||||
assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar"
|
||||
|
|
|
@ -60,19 +60,6 @@ def test_vocab_lexeme_add_flag_provided_id(en_vocab):
|
|||
assert en_vocab["dogs"].check_flag(is_len4) is True
|
||||
|
||||
|
||||
def test_lexeme_bytes_roundtrip(en_vocab):
|
||||
one = en_vocab["one"]
|
||||
alpha = en_vocab["alpha"]
|
||||
assert one.orth != alpha.orth
|
||||
assert one.lower != alpha.lower
|
||||
alpha.from_bytes(one.to_bytes())
|
||||
|
||||
assert one.orth_ == alpha.orth_
|
||||
assert one.orth == alpha.orth
|
||||
assert one.lower == alpha.lower
|
||||
assert one.lower_ == alpha.lower_
|
||||
|
||||
|
||||
def test_vocab_lexeme_oov_rank(en_vocab):
|
||||
"""Test that default rank is OOV_RANK."""
|
||||
lex = en_vocab["word"]
|
||||
|
|
|
@ -119,12 +119,11 @@ def test_lookups_to_from_bytes_via_vocab():
|
|||
table_name = "test"
|
||||
vocab = Vocab()
|
||||
vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
|
||||
assert len(vocab.lookups) == 1
|
||||
assert table_name in vocab.lookups
|
||||
vocab_bytes = vocab.to_bytes()
|
||||
new_vocab = Vocab()
|
||||
new_vocab.from_bytes(vocab_bytes)
|
||||
assert len(new_vocab.lookups) == 1
|
||||
assert len(new_vocab.lookups) == len(vocab.lookups)
|
||||
assert table_name in new_vocab.lookups
|
||||
table = new_vocab.lookups.get_table(table_name)
|
||||
assert len(table) == 2
|
||||
|
@ -137,13 +136,12 @@ def test_lookups_to_from_disk_via_vocab():
|
|||
table_name = "test"
|
||||
vocab = Vocab()
|
||||
vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
|
||||
assert len(vocab.lookups) == 1
|
||||
assert table_name in vocab.lookups
|
||||
with make_tempdir() as tmpdir:
|
||||
vocab.to_disk(tmpdir)
|
||||
new_vocab = Vocab()
|
||||
new_vocab.from_disk(tmpdir)
|
||||
assert len(new_vocab.lookups) == 1
|
||||
assert len(new_vocab.lookups) == len(vocab.lookups)
|
||||
assert table_name in new_vocab.lookups
|
||||
table = new_vocab.lookups.get_table(table_name)
|
||||
assert len(table) == 2
|
||||
|
|
|
@ -329,3 +329,15 @@ def test_vocab_prune_vectors():
|
|||
neighbour, similarity = list(remap.values())[0]
|
||||
assert neighbour == "cat", remap
|
||||
assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)
|
||||
|
||||
|
||||
def test_vector_is_oov():
|
||||
vocab = Vocab(vectors_name="test_vocab_is_oov")
|
||||
data = numpy.ndarray((5, 3), dtype="f")
|
||||
data[0] = 1.0
|
||||
data[1] = 2.0
|
||||
vocab.set_vector("cat", data[0])
|
||||
vocab.set_vector("dog", data[1])
|
||||
assert vocab["cat"].is_oov is True
|
||||
assert vocab["dog"].is_oov is True
|
||||
assert vocab["hamster"].is_oov is False
|
||||
|
|
|
@ -17,7 +17,7 @@ from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL
from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL
from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
from ..symbols cimport conj

@ -259,7 +259,7 @@ cdef class Token:
    @property
    def prob(self):
        """RETURNS (float): Smoothed log probability estimate of token type."""
        return self.c.lex.prob
        return self.vocab[self.c.lex.orth].prob

    @property
    def sentiment(self):

@ -267,7 +267,7 @@ cdef class Token:
        negativity of the token."""
        if "sentiment" in self.doc.user_token_hooks:
            return self.doc.user_token_hooks["sentiment"](self)
        return self.c.lex.sentiment
        return self.vocab[self.c.lex.orth].sentiment

    @property
    def lang(self):

@ -286,7 +286,7 @@ cdef class Token:
    @property
    def cluster(self):
        """RETURNS (int): Brown cluster ID."""
        return self.c.lex.cluster
        return self.vocab[self.c.lex.orth].cluster

    @property
    def orth(self):

@ -923,7 +923,7 @@ cdef class Token:
    @property
    def is_oov(self):
        """RETURNS (bool): Whether the token is out-of-vocabulary."""
        return Lexeme.c_check_flag(self.c.lex, IS_OOV)
        return self.c.lex.orth in self.vocab.vectors

    @property
    def is_stop(self):

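Because the struct no longer carries prob, cluster or sentiment, the corresponding Token properties delegate to the parent vocab's Lexeme (and therefore to the lookups tables). A quick sketch, not part of the commit, showing that a change made via the vocab is visible on tokens:

import spacy

nlp = spacy.blank("en")
doc = nlp("the cat sat")

nlp.vocab["cat"].cluster = 123     # stored in the "lexeme_cluster" extra table
nlp.vocab["cat"].sentiment = 0.5   # stored in the "lexeme_sentiment" table

token = doc[1]
print(token.cluster)    # 123, delegated to vocab["cat"].cluster
print(token.sentiment)  # 0.5, delegated to vocab["cat"].sentiment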
@ -30,6 +30,7 @@ cdef class Vocab:
    cpdef public Morphology morphology
    cpdef public object vectors
    cpdef public object lookups
    cpdef public object lookups_extra
    cdef readonly int length
    cdef public object data_dir
    cdef public object lex_attr_getters

spacy/vocab.pyx
@ -11,8 +11,7 @@ from .lexeme cimport EMPTY_LEXEME, OOV_RANK
from .lexeme cimport Lexeme
from .typedefs cimport attr_t
from .tokens.token cimport Token
from .attrs cimport PROB, LANG, ORTH, TAG, POS
from .structs cimport SerializedLexemeC
from .attrs cimport LANG, ORTH, TAG, POS

from .compat import copy_reg, basestring_
from .errors import Errors

@ -22,6 +21,8 @@ from .vectors import Vectors
from ._ml import link_vectors_to_models
from .lookups import Lookups
from . import util
from .lang.norm_exceptions import BASE_NORMS
from .lang.lex_attrs import LEX_ATTRS


cdef class Vocab:

@ -32,8 +33,8 @@ cdef class Vocab:
    DOCS: https://spacy.io/api/vocab
    """
    def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
                 strings=tuple(), lookups=None, oov_prob=-20., vectors_name=None,
                 **deprecated_kwargs):
                 strings=tuple(), lookups=None, lookups_extra=None,
                 oov_prob=-20., vectors_name=None, **deprecated_kwargs):
        """Create the vocabulary.

        lex_attr_getters (dict): A dictionary mapping attribute IDs to

@ -44,6 +45,7 @@ cdef class Vocab:
        strings (StringStore): StringStore that maps strings to integers, and
            vice versa.
        lookups (Lookups): Container for large lookup tables and dictionaries.
        lookups_extra (Lookups): Container for optional lookup tables and dictionaries.
        name (unicode): Optional name to identify the vectors table.
        RETURNS (Vocab): The newly constructed object.
        """

@ -51,8 +53,12 @@ cdef class Vocab:
        tag_map = tag_map if tag_map is not None else {}
        if lookups in (None, True, False):
            lookups = Lookups()
        if "lexeme_norm" not in lookups:
            lookups.add_table("lexeme_norm")
        if lemmatizer in (None, True, False):
            lemmatizer = Lemmatizer(lookups)
        if lookups_extra in (None, True, False):
            lookups_extra = Lookups()
        self.cfg = {'oov_prob': oov_prob}
        self.mem = Pool()
        self._by_orth = PreshMap()

@ -65,6 +71,7 @@ cdef class Vocab:
        self.morphology = Morphology(self.strings, tag_map, lemmatizer)
        self.vectors = Vectors(name=vectors_name)
        self.lookups = lookups
        self.lookups_extra = lookups_extra

    @property
    def lang(self):

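Vocab now always owns a "lexeme_norm" table and an extra Lookups container, even when nothing is passed in, so downstream code can rely on both existing. A small sketch of the constructor behaviour (not part of the commit):

from spacy.vocab import Vocab
from spacy.lookups import Lookups

# Default construction: lookups and lookups_extra are created automatically,
# and the "lexeme_norm" table is guaranteed to exist.
vocab = Vocab()
print("lexeme_norm" in vocab.lookups)   # True
print(len(vocab.lookups_extra))         # 0, empty container

# Passing pre-built containers is also supported.
extra = Lookups()
extra.add_table("lexeme_cluster", {"dog": 5})
vocab = Vocab(lookups_extra=extra)
print("lexeme_cluster" in vocab.lookups_extra)  # True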
@ -173,9 +180,7 @@ cdef class Vocab:
                value = func(string)
                if isinstance(value, unicode):
                    value = self.strings.add(value)
                if attr == PROB:
                    lex.prob = value
                elif value is not None:
                if value is not None:
                    Lexeme.set_struct_attr(lex, attr, value)
        if not is_oov:
            self._add_lex_to_vocab(lex.orth, lex)

@ -435,17 +440,16 @@ cdef class Vocab:
        path = util.ensure_path(path)
        if not path.exists():
            path.mkdir()
        setters = ["strings", "lexemes", "vectors"]
        setters = ["strings", "vectors"]
        exclude = util.get_serialization_exclude(setters, exclude, kwargs)
        if "strings" not in exclude:
            self.strings.to_disk(path / "strings.json")
        if "lexemes" not in exclude:
            with (path / "lexemes.bin").open("wb") as file_:
                file_.write(self.lexemes_to_bytes())
        if "vectors" not in "exclude" and self.vectors is not None:
            self.vectors.to_disk(path)
        if "lookups" not in "exclude" and self.lookups is not None:
            self.lookups.to_disk(path)
        if "lookups_extra" not in "exclude" and self.lookups_extra is not None:
            self.lookups_extra.to_disk(path, filename="lookups_extra.bin")

    def from_disk(self, path, exclude=tuple(), **kwargs):
        """Loads state from a directory. Modifies the object in place and

@ -458,13 +462,10 @@ cdef class Vocab:
        DOCS: https://spacy.io/api/vocab#to_disk
        """
        path = util.ensure_path(path)
        getters = ["strings", "lexemes", "vectors"]
        getters = ["strings", "vectors"]
        exclude = util.get_serialization_exclude(getters, exclude, kwargs)
        if "strings" not in exclude:
            self.strings.from_disk(path / "strings.json")  # TODO: add exclude?
        if "lexemes" not in exclude:
            with (path / "lexemes.bin").open("rb") as file_:
                self.lexemes_from_bytes(file_.read())
        if "vectors" not in exclude:
            if self.vectors is not None:
                self.vectors.from_disk(path, exclude=["strings"])

@ -472,6 +473,14 @@ cdef class Vocab:
            link_vectors_to_models(self)
        if "lookups" not in exclude:
            self.lookups.from_disk(path)
        if "lookups_extra" not in exclude:
            self.lookups_extra.from_disk(path, filename="lookups_extra.bin")
        if "lexeme_norm" in self.lookups:
            self.lex_attr_getters[NORM] = util.add_lookups(
                self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm")
            )
        self.length = 0
        self._by_orth = PreshMap()
        return self

    def to_bytes(self, exclude=tuple(), **kwargs):

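The visible effect of these two hunks: a saved model directory no longer contains lexemes.bin, and loading no longer rebuilds the whole lexeme cache; the NORM getter is rewired from the deserialized "lexeme_norm" table, the cached lexemes are dropped, and entries are recreated on demand. A round-trip sketch (not part of the commit, behaviour as I read the code above):

import tempfile
from spacy.vocab import Vocab

vocab = Vocab()
vocab["hello"].norm_ = "hi"          # populates the "lexeme_norm" lookups table

with tempfile.TemporaryDirectory() as tmpdir:
    vocab.to_disk(tmpdir)            # writes strings.json, vectors, lookups.bin, ...
    reloaded = Vocab().from_disk(tmpdir)

# The lexeme cache starts out empty and is filled lazily when entries are touched.
print(len(reloaded))            # 0: no lexemes.bin to restore
print(reloaded["hello"].norm_)  # "hi", served from the reloaded "lexeme_norm" table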
@ -490,9 +499,9 @@ cdef class Vocab:

        getters = OrderedDict((
            ("strings", lambda: self.strings.to_bytes()),
            ("lexemes", lambda: self.lexemes_to_bytes()),
            ("vectors", deserialize_vectors),
            ("lookups", lambda: self.lookups.to_bytes())
            ("lookups", lambda: self.lookups.to_bytes()),
            ("lookups_extra", lambda: self.lookups_extra.to_bytes())
        ))
        exclude = util.get_serialization_exclude(getters, exclude, kwargs)
        return util.to_bytes(getters, exclude)

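The byte-level protocol changes the same way: the "lexemes" section disappears and "lookups" / "lookups_extra" sections are added, so a bytes round trip preserves the norm exceptions without carrying any lexeme structs. A short sketch, not part of the commit:

from spacy.vocab import Vocab

vocab = Vocab()
vocab["daß"].norm_ = "dass"

vocab_bytes = vocab.to_bytes()            # strings + vectors + lookups + lookups_extra
restored = Vocab().from_bytes(vocab_bytes)

print("lexeme_norm" in restored.lookups)  # True
print(restored["daß"].norm_)              # "dass", via the re-applied NORM getter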
@ -514,99 +523,62 @@ cdef class Vocab:

        setters = OrderedDict((
            ("strings", lambda b: self.strings.from_bytes(b)),
            ("lexemes", lambda b: self.lexemes_from_bytes(b)),
            ("vectors", lambda b: serialize_vectors(b)),
            ("lookups", lambda b: self.lookups.from_bytes(b))
            ("lookups", lambda b: self.lookups.from_bytes(b)),
            ("lookups_extra", lambda b: self.lookups_extra.from_bytes(b))
        ))
        exclude = util.get_serialization_exclude(setters, exclude, kwargs)
        util.from_bytes(bytes_data, setters, exclude)
        if "lexeme_norm" in self.lookups:
            self.lex_attr_getters[NORM] = util.add_lookups(
                self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm")
            )
        self.length = 0
        self._by_orth = PreshMap()
        if self.vectors.name is not None:
            link_vectors_to_models(self)
        return self

    def lexemes_to_bytes(self):
        cdef hash_t key
        cdef size_t addr
        cdef LexemeC* lexeme = NULL
        cdef SerializedLexemeC lex_data
        cdef int size = 0
        for key, addr in self._by_orth.items():
            if addr == 0:
                continue
            size += sizeof(lex_data.data)
        byte_string = b"\0" * size
        byte_ptr = <unsigned char*>byte_string
        cdef int j
        cdef int i = 0
        for key, addr in self._by_orth.items():
            if addr == 0:
                continue
            lexeme = <LexemeC*>addr
            lex_data = Lexeme.c_to_bytes(lexeme)
            for j in range(sizeof(lex_data.data)):
                byte_ptr[i] = lex_data.data[j]
                i += 1
        return byte_string

    def lexemes_from_bytes(self, bytes bytes_data):
        """Load the binary vocabulary data from the given string."""
        cdef LexemeC* lexeme
        cdef hash_t key
        cdef unicode py_str
        cdef int i = 0
        cdef int j = 0
        cdef SerializedLexemeC lex_data
        chunk_size = sizeof(lex_data.data)
        cdef void* ptr
        cdef unsigned char* bytes_ptr = bytes_data
        for i in range(0, len(bytes_data), chunk_size):
            lexeme = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
            for j in range(sizeof(lex_data.data)):
                lex_data.data[j] = bytes_ptr[i+j]
            Lexeme.c_from_bytes(lexeme, lex_data)
            prev_entry = self._by_orth.get(lexeme.orth)
            if prev_entry != NULL:
                memcpy(prev_entry, lexeme, sizeof(LexemeC))
                continue
            ptr = self.strings._map.get(lexeme.orth)
            if ptr == NULL:
                continue
            py_str = self.strings[lexeme.orth]
            if self.strings[py_str] != lexeme.orth:
                raise ValueError(Errors.E086.format(string=py_str,
                                                    orth_id=lexeme.orth,
                                                    hash_id=self.strings[py_str]))
            self._by_orth.set(lexeme.orth, lexeme)
            self.length += 1

    def _reset_cache(self, keys, strings):
        # I'm not sure this made sense. Disable it for now.
        raise NotImplementedError

    def load_extra_lookups(self, table_name):
        if table_name not in self.lookups_extra:
            if self.lang + "_extra" in util.registry.lookups:
                tables = util.registry.lookups.get(self.lang + "_extra")
                for name, filename in tables.items():
                    if table_name == name:
                        data = util.load_language_data(filename)
                        self.lookups_extra.add_table(name, data)
            if table_name not in self.lookups_extra:
                self.lookups_extra.add_table(table_name)
        return self.lookups_extra.get_table(table_name)

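load_extra_lookups is the lazy-loading hook for the data that was dropped from the stored lexemes: on first access it checks the lookups registry for a "<lang>_extra" entry (as registered by spacy-lookups-data), loads the matching table if one is registered, and otherwise falls back to an empty table so callers never have to special-case missing data. A usage sketch, not part of the commit:

from spacy.vocab import Vocab

vocab = Vocab()

# First access: nothing registered for this vocab's language, so an empty
# "lexeme_cluster" table is created in vocab.lookups_extra and returned.
cluster_table = vocab.load_extra_lookups("lexeme_cluster")
print(len(cluster_table))                       # 0
print("lexeme_cluster" in vocab.lookups_extra)  # True

# Subsequent property access reuses the same table.
vocab["dog"].cluster = 5
print(vocab["dog"].cluster)                     # 5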

def pickle_vocab(vocab):
    sstore = vocab.strings
    vectors = vocab.vectors
    morph = vocab.morphology
    length = vocab.length
    data_dir = vocab.data_dir
    lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
    lexemes_data = vocab.lexemes_to_bytes()
    lookups = vocab.lookups
    lookups_extra = vocab.lookups_extra
    return (unpickle_vocab,
            (sstore, vectors, morph, data_dir, lex_attr_getters, lexemes_data, length))
            (sstore, vectors, morph, data_dir, lex_attr_getters, lookups, lookups_extra))


def unpickle_vocab(sstore, vectors, morphology, data_dir,
                   lex_attr_getters, bytes lexemes_data, int length):
                   lex_attr_getters, lookups, lookups_extra):
    cdef Vocab vocab = Vocab()
    vocab.length = length
    vocab.vectors = vectors
    vocab.strings = sstore
    vocab.morphology = morphology
    vocab.data_dir = data_dir
    vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
    vocab.lexemes_from_bytes(lexemes_data)
    vocab.length = length
    vocab.lookups = lookups
    vocab.lookups_extra = lookups_extra
    return vocab

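Pickling follows suit: instead of shipping a serialized lexemes blob plus a length, the reduce tuple now carries the lookups and lookups_extra containers directly. A sketch of the round trip (not part of the commit; the equality check mirrors the test_pickle_vocab test added earlier in this commit):

import pickle
from spacy.vocab import Vocab

vocab = Vocab()
vocab["cuz"].norm_ = "because"     # written into the "lexeme_norm" lookups table

restored = pickle.loads(pickle.dumps(vocab))
print("lexeme_norm" in restored.lookups)                   # True
print("cuz" in restored.lookups.get_table("lexeme_norm"))  # True: the entry travelled
print(restored.to_bytes() == vocab.to_bytes())             # True, as test_pickle_vocab asserts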