Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-06-03 14:33:51 -05:00
commit 468ca6c760
28 changed files with 2275 additions and 324 deletions

View File

@@ -3,7 +3,7 @@ pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.7.1,<6.8.0
+thinc>=6.7.2,<6.8.0
 murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
 six

View File

@@ -191,7 +191,7 @@ def setup_package():
     'murmurhash>=0.28,<0.29',
     'cymem>=1.30,<1.32',
     'preshed>=1.0.0,<2.0.0',
-    'thinc>=6.7.1,<6.8.0',
+    'thinc>=6.7.2,<6.8.0',
     'plac<1.0.0,>=0.9.6',
     'pip>=9.0.0,<10.0.0',
     'six',

View File

@@ -28,15 +28,17 @@ from .. import displacy
     n_iter=("number of iterations", "option", "n", int),
     n_sents=("number of sentences", "option", "ns", int),
     use_gpu=("Use GPU", "flag", "G", bool),
+    resume=("Whether to resume training", "flag", "R", bool),
     no_tagger=("Don't train tagger", "flag", "T", bool),
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool)
 )
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
-          use_gpu=False, no_tagger=False, no_parser=False, no_entities=False):
+          use_gpu=False, resume=False, no_tagger=False, no_parser=False, no_entities=False):
     """
     Train a model. Expects data in spaCy's JSON format.
     """
+    util.set_env_log(True)
     n_sents = n_sents or None
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)
@@ -66,7 +68,11 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                                    util.env_opt('batch_to', 64),
                                    util.env_opt('batch_compound', 1.001))
-    nlp = lang_class(pipeline=pipeline)
+    if resume:
+        prints(output_path / 'model19.pickle', title="Resuming training")
+        nlp = dill.load((output_path / 'model19.pickle').open('rb'))
+    else:
+        nlp = lang_class(pipeline=pipeline)
     corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
     n_train_docs = corpus.count_train()
@@ -75,6 +81,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
     try:
         for i in range(n_iter):
+            if resume:
+                i += 20
             with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar:
                 train_docs = corpus.train_docs(nlp, projectivize=True,
                                                gold_preproc=False, max_length=0)
@@ -86,14 +94,18 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                     pbar.update(len(docs))
             with nlp.use_params(optimizer.averages):
+                util.set_env_log(False)
+                epoch_model_path = output_path / ('model%d' % i)
+                nlp.to_disk(epoch_model_path)
                 with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
                     dill.dump(nlp, file_, -1)
-                with (output_path / ('model%d.bin' % i)).open('wb') as file_:
-                    file_.write(nlp.to_bytes())
-                with (output_path / ('model%d.bin' % i)).open('rb') as file_:
-                    nlp_loaded = lang_class(pipeline=pipeline)
-                    nlp_loaded.from_bytes(file_.read())
-                scorer = nlp_loaded.evaluate(corpus.dev_docs(nlp_loaded, gold_preproc=False))
+                nlp_loaded = lang_class(pipeline=pipeline)
+                nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
+                scorer = nlp_loaded.evaluate(
+                    corpus.dev_docs(
+                        nlp_loaded,
+                        gold_preproc=False))
+                util.set_env_log(True)
                 print_progress(i, losses, scorer.scores)
     finally:
         print("Saving model...")
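For context, a minimal sketch of reloading one of the per-epoch directories the updated loop writes out. The pipeline names, output path and epoch number are illustrative assumptions, not values taken from this diff:

from spacy.util import get_lang_class

lang_class = get_lang_class('en')
nlp_loaded = lang_class(pipeline=['tagger', 'parser', 'ner'])   # hypothetical pipeline
nlp_loaded = nlp_loaded.from_disk('/output/model3')             # hypothetical epoch directory
# scorer = nlp_loaded.evaluate(corpus.dev_docs(nlp_loaded, gold_preproc=False))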

View File

@@ -56,7 +56,12 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
     render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
     httpd = simple_server.make_server('0.0.0.0', port, app)
     prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
-    httpd.serve_forever()
+    try:
+        httpd.serve_forever()
+    except KeyboardInterrupt:
+        prints("Shutting down server on port %d." % port)
+    finally:
+        httpd.server_close()

 def app(environ, start_response):
@@ -65,12 +70,13 @@ def app(environ, start_response):
     return [res]

-def parse_deps(doc, options={}):
+def parse_deps(orig_doc, options={}):
     """Generate dependency parse in {'words': [], 'arcs': []} format.
     doc (Doc): Document do parse.
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
+    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
     if options.get('collapse_punct', True):
         spans = []
         for word in doc[:-1]:
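Since parse_deps now operates on a byte-copy of the document, rendering no longer mutates the caller's Doc (for example by collapsing punctuation). A rough sketch, assuming an installed 'en' model:

import spacy
from spacy import displacy

nlp = spacy.load('en')                    # assumes the 'en' model is installed
doc = nlp(u'This is a sentence.')
n_tokens = len(doc)
html = displacy.render(doc, style='dep', page=True)
assert len(doc) == n_tokens               # the original doc is left untouched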

View File

@@ -18,12 +18,11 @@ class DependencyRenderer(object):
             offset_x, color, bg, font)
         """
         self.compact = options.get('compact', False)
-        distance, arrow_width = (85, 8) if self.compact else (175, 10)
         self.word_spacing = options.get('word_spacing', 45)
-        self.arrow_spacing = options.get('arrow_spacing', 20)
-        self.arrow_width = options.get('arrow_width', arrow_width)
+        self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20)
+        self.arrow_width = options.get('arrow_width', 6 if self.compact else 10)
         self.arrow_stroke = options.get('arrow_stroke', 2)
-        self.distance = options.get('distance', distance)
+        self.distance = options.get('distance', 150 if self.compact else 175)
         self.offset_x = options.get('offset_x', 50)
         self.color = options.get('color', '#000000')
         self.bg = options.get('bg', '#ffffff')
@@ -99,6 +98,8 @@ class DependencyRenderer(object):
         x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
                  -self.arrow_spacing*(self.highest_level-level)/4)
         y_curve = self.offset_y-level*self.distance/2
+        if self.compact:
+            y_curve = self.offset_y-level*self.distance/6
         if y_curve == 0 and len(self.levels) > 5:
             y_curve = -self.distance
         arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
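Compact mode now implies tighter spacing defaults (arrow_spacing 12, arrow_width 6, distance 150) and flatter arcs, but every value can still be overridden through the options dict. A small sketch using manual mode so no model is required; the words/arcs payload is made up:

from spacy import displacy

parsed = {'words': [{'text': 'This', 'tag': 'DT'},
                    {'text': 'works', 'tag': 'VBZ'}],
          'arcs': [{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'}]}
html = displacy.render([parsed], style='dep', manual=True,
                       options={'compact': True, 'distance': 120})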

View File

@@ -21,7 +21,7 @@ TPL_DEP_WORDS = """
 TPL_DEP_ARCS = """
 <g class="displacy-arrow">
     <path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/>
-    <text dy="1.25em" style="font-size: 0.8em">
+    <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
         <textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">{label}</textPath>
     </text>
     <path class="displacy-arrowhead" d="{head}" fill="currentColor"/>

View File

@@ -212,7 +212,7 @@ class GoldCorpus(object):
     def dev_docs(self, nlp, gold_preproc=False):
         gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
-        gold_docs = nlp.preprocess_gold(gold_docs)
+        #gold_docs = nlp.preprocess_gold(gold_docs)
         yield from gold_docs

     @classmethod
@@ -227,7 +227,7 @@ class GoldCorpus(object):
                                                 gold_preproc)
             golds = cls._make_golds(docs, paragraph_tuples)
             for doc, gold in zip(docs, golds):
-                if not max_length or len(doc) < max_length:
+                if (not max_length) or len(doc) < max_length:
                     yield doc, gold

     @classmethod
@@ -235,17 +235,17 @@ class GoldCorpus(object):
         if raw_text is not None:
             return [nlp.make_doc(raw_text)]
         else:
-            return [Doc(nlp.vocab, words=sent_tuples[0][1])
-                    for sent_tuples in paragraph_tuples]
+            return [Doc(nlp.vocab, words=sent_tuples[1])
+                    for (sent_tuples, brackets) in paragraph_tuples]

     @classmethod
     def _make_golds(cls, docs, paragraph_tuples):
+        assert len(docs) == len(paragraph_tuples)
         if len(docs) == 1:
-            return [GoldParse.from_annot_tuples(docs[0], sent_tuples[0])
-                    for sent_tuples in paragraph_tuples]
+            return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])]
         else:
-            return [GoldParse.from_annot_tuples(doc, sent_tuples[0])
-                    for doc, sent_tuples in zip(docs, paragraph_tuples)]
+            return [GoldParse.from_annot_tuples(doc, sent_tuples)
+                    for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)]

     @staticmethod
     def walk_corpus(path):

View File

@@ -2,21 +2,25 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .norm_exceptions import NORM_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...lemmatizerlookup import Lemmatizer
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups

 class GermanDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'de'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
+                                         BASE_NORMS, NORM_EXCEPTIONS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = dict(TAG_MAP)

View File

@@ -0,0 +1,17 @@
# coding: utf8
from __future__ import unicode_literals

# Here we only want to include the absolute most common words. Otherwise,
# this list would get impossibly long for German especially considering the
# old vs. new spelling rules, and all possible cases.

_exc = {
    "daß": "dass"
}

NORM_EXCEPTIONS = {}

for string, norm in _exc.items():
    NORM_EXCEPTIONS[string.title()] = norm

View File

@ -8,7 +8,7 @@ from ...deprecated import PRON_LEMMA
_exc = { _exc = {
"auf'm": [ "auf'm": [
{ORTH: "auf", LEMMA: "auf"}, {ORTH: "auf", LEMMA: "auf"},
{ORTH: "'m", LEMMA: "der", NORM: "dem" }], {ORTH: "'m", LEMMA: "der", NORM: "dem"}],
"du's": [ "du's": [
{ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"},
@ -53,97 +53,97 @@ _exc = {
for exc_data in [ for exc_data in [
{ORTH: "'S", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "'S", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "S'", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "S'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "s'", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "s'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "'n", LEMMA: "ein", NORM: "ein"}, {ORTH: "'n", LEMMA: "ein", NORM: "ein"},
{ORTH: "'ne", LEMMA: "eine", NORM: "eine"}, {ORTH: "'ne", LEMMA: "eine", NORM: "eine"},
{ORTH: "'nen", LEMMA: "ein", NORM: "einen"}, {ORTH: "'nen", LEMMA: "ein", NORM: "einen"},
{ORTH: "'nem", LEMMA: "ein", NORM: "einem"}, {ORTH: "'nem", LEMMA: "ein", NORM: "einem"},
{ORTH: "Abb.", LEMMA: "Abbildung"}, {ORTH: "Abb.", LEMMA: "Abbildung", NORM: "Abbildung"},
{ORTH: "Abk.", LEMMA: "Abkürzung"}, {ORTH: "Abk.", LEMMA: "Abkürzung", NORM: "Abkürzung"},
{ORTH: "Abt.", LEMMA: "Abteilung"}, {ORTH: "Abt.", LEMMA: "Abteilung", NORM: "Abteilung"},
{ORTH: "Apr.", LEMMA: "April"}, {ORTH: "Apr.", LEMMA: "April", NORM: "April"},
{ORTH: "Aug.", LEMMA: "August"}, {ORTH: "Aug.", LEMMA: "August", NORM: "August"},
{ORTH: "Bd.", LEMMA: "Band"}, {ORTH: "Bd.", LEMMA: "Band", NORM: "Band"},
{ORTH: "Betr.", LEMMA: "Betreff"}, {ORTH: "Betr.", LEMMA: "Betreff", NORM: "Betreff"},
{ORTH: "Bf.", LEMMA: "Bahnhof"}, {ORTH: "Bf.", LEMMA: "Bahnhof", NORM: "Bahnhof"},
{ORTH: "Bhf.", LEMMA: "Bahnhof"}, {ORTH: "Bhf.", LEMMA: "Bahnhof", NORM: "Bahnhof"},
{ORTH: "Bsp.", LEMMA: "Beispiel"}, {ORTH: "Bsp.", LEMMA: "Beispiel", NORM: "Beispiel"},
{ORTH: "Dez.", LEMMA: "Dezember"}, {ORTH: "Dez.", LEMMA: "Dezember", NORM: "Dezember"},
{ORTH: "Di.", LEMMA: "Dienstag"}, {ORTH: "Di.", LEMMA: "Dienstag", NORM: "Dienstag"},
{ORTH: "Do.", LEMMA: "Donnerstag"}, {ORTH: "Do.", LEMMA: "Donnerstag", NORM: "Donnerstag"},
{ORTH: "Fa.", LEMMA: "Firma"}, {ORTH: "Fa.", LEMMA: "Firma", NORM: "Firma"},
{ORTH: "Fam.", LEMMA: "Familie"}, {ORTH: "Fam.", LEMMA: "Familie", NORM: "Familie"},
{ORTH: "Feb.", LEMMA: "Februar"}, {ORTH: "Feb.", LEMMA: "Februar", NORM: "Februar"},
{ORTH: "Fr.", LEMMA: "Frau"}, {ORTH: "Fr.", LEMMA: "Frau", NORM: "Frau"},
{ORTH: "Frl.", LEMMA: "Fräulein"}, {ORTH: "Frl.", LEMMA: "Fräulein", NORM: "Fräulein"},
{ORTH: "Hbf.", LEMMA: "Hauptbahnhof"}, {ORTH: "Hbf.", LEMMA: "Hauptbahnhof", NORM: "Hauptbahnhof"},
{ORTH: "Hr.", LEMMA: "Herr"}, {ORTH: "Hr.", LEMMA: "Herr", NORM: "Herr"},
{ORTH: "Hrn.", LEMMA: "Herr"}, {ORTH: "Hrn.", LEMMA: "Herr", NORM: "Herrn"},
{ORTH: "Jan.", LEMMA: "Januar"}, {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
{ORTH: "Jh.", LEMMA: "Jahrhundert"}, {ORTH: "Jh.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"},
{ORTH: "Jhd.", LEMMA: "Jahrhundert"}, {ORTH: "Jhd.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"},
{ORTH: "Jul.", LEMMA: "Juli"}, {ORTH: "Jul.", LEMMA: "Juli", NORM: "Juli"},
{ORTH: "Jun.", LEMMA: "Juni"}, {ORTH: "Jun.", LEMMA: "Juni", NORM: "Juni"},
{ORTH: "Mi.", LEMMA: "Mittwoch"}, {ORTH: "Mi.", LEMMA: "Mittwoch", NORM: "Mittwoch"},
{ORTH: "Mio.", LEMMA: "Million"}, {ORTH: "Mio.", LEMMA: "Million", NORM: "Million"},
{ORTH: "Mo.", LEMMA: "Montag"}, {ORTH: "Mo.", LEMMA: "Montag", NORM: "Montag"},
{ORTH: "Mrd.", LEMMA: "Milliarde"}, {ORTH: "Mrd.", LEMMA: "Milliarde", NORM: "Milliarde"},
{ORTH: "Mrz.", LEMMA: "März"}, {ORTH: "Mrz.", LEMMA: "März", NORM: "März"},
{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer"}, {ORTH: "MwSt.", LEMMA: "Mehrwertsteuer", NORM: "Mehrwertsteuer"},
{ORTH: "Mär.", LEMMA: "März"}, {ORTH: "Mär.", LEMMA: "März", NORM: "März"},
{ORTH: "Nov.", LEMMA: "November"}, {ORTH: "Nov.", LEMMA: "November", NORM: "November"},
{ORTH: "Nr.", LEMMA: "Nummer"}, {ORTH: "Nr.", LEMMA: "Nummer", NORM: "Nummer"},
{ORTH: "Okt.", LEMMA: "Oktober"}, {ORTH: "Okt.", LEMMA: "Oktober", NORM: "Oktober"},
{ORTH: "Orig.", LEMMA: "Original"}, {ORTH: "Orig.", LEMMA: "Original", NORM: "Original"},
{ORTH: "Pkt.", LEMMA: "Punkt"}, {ORTH: "Pkt.", LEMMA: "Punkt", NORM: "Punkt"},
{ORTH: "Prof.", LEMMA: "Professor"}, {ORTH: "Prof.", LEMMA: "Professor", NORM: "Professor"},
{ORTH: "Red.", LEMMA: "Redaktion"}, {ORTH: "Red.", LEMMA: "Redaktion", NORM: "Redaktion"},
{ORTH: "Sa.", LEMMA: "Samstag"}, {ORTH: "Sa.", LEMMA: "Samstag", NORM: "Samstag"},
{ORTH: "Sep.", LEMMA: "September"}, {ORTH: "Sep.", LEMMA: "September", NORM: "September"},
{ORTH: "Sept.", LEMMA: "September"}, {ORTH: "Sept.", LEMMA: "September", NORM: "September"},
{ORTH: "So.", LEMMA: "Sonntag"}, {ORTH: "So.", LEMMA: "Sonntag", NORM: "Sonntag"},
{ORTH: "Std.", LEMMA: "Stunde"}, {ORTH: "Std.", LEMMA: "Stunde", NORM: "Stunde"},
{ORTH: "Str.", LEMMA: "Straße"}, {ORTH: "Str.", LEMMA: "Straße", NORM: "Straße"},
{ORTH: "Tel.", LEMMA: "Telefon"}, {ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"},
{ORTH: "Tsd.", LEMMA: "Tausend"}, {ORTH: "Tsd.", LEMMA: "Tausend", NORM: "Tausend"},
{ORTH: "Univ.", LEMMA: "Universität"}, {ORTH: "Univ.", LEMMA: "Universität", NORM: "Universität"},
{ORTH: "abzgl.", LEMMA: "abzüglich"}, {ORTH: "abzgl.", LEMMA: "abzüglich", NORM: "abzüglich"},
{ORTH: "allg.", LEMMA: "allgemein"}, {ORTH: "allg.", LEMMA: "allgemein", NORM: "allgemein"},
{ORTH: "bspw.", LEMMA: "beispielsweise"}, {ORTH: "bspw.", LEMMA: "beispielsweise", NORM: "beispielsweise"},
{ORTH: "bzgl.", LEMMA: "bezüglich"}, {ORTH: "bzgl.", LEMMA: "bezüglich", NORM: "bezüglich"},
{ORTH: "bzw.", LEMMA: "beziehungsweise"}, {ORTH: "bzw.", LEMMA: "beziehungsweise", NORM: "beziehungsweise"},
{ORTH: "d.h.", LEMMA: "das heißt"}, {ORTH: "d.h.", LEMMA: "das heißt"},
{ORTH: "dgl.", LEMMA: "dergleichen"}, {ORTH: "dgl.", LEMMA: "dergleichen", NORM: "dergleichen"},
{ORTH: "ebd.", LEMMA: "ebenda"}, {ORTH: "ebd.", LEMMA: "ebenda", NORM: "ebenda"},
{ORTH: "eigtl.", LEMMA: "eigentlich"}, {ORTH: "eigtl.", LEMMA: "eigentlich", NORM: "eigentlich"},
{ORTH: "engl.", LEMMA: "englisch"}, {ORTH: "engl.", LEMMA: "englisch", NORM: "englisch"},
{ORTH: "evtl.", LEMMA: "eventuell"}, {ORTH: "evtl.", LEMMA: "eventuell", NORM: "eventuell"},
{ORTH: "frz.", LEMMA: "französisch"}, {ORTH: "frz.", LEMMA: "französisch", NORM: "französisch"},
{ORTH: "gegr.", LEMMA: "gegründet"}, {ORTH: "gegr.", LEMMA: "gegründet", NORM: "gegründet"},
{ORTH: "ggf.", LEMMA: "gegebenenfalls"}, {ORTH: "ggf.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"},
{ORTH: "ggfs.", LEMMA: "gegebenenfalls"}, {ORTH: "ggfs.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"},
{ORTH: "ggü.", LEMMA: "gegenüber"}, {ORTH: "ggü.", LEMMA: "gegenüber", NORM: "gegenüber"},
{ORTH: "i.O.", LEMMA: "in Ordnung"}, {ORTH: "i.O.", LEMMA: "in Ordnung"},
{ORTH: "i.d.R.", LEMMA: "in der Regel"}, {ORTH: "i.d.R.", LEMMA: "in der Regel"},
{ORTH: "incl.", LEMMA: "inklusive"}, {ORTH: "incl.", LEMMA: "inklusive", NORM: "inklusive"},
{ORTH: "inkl.", LEMMA: "inklusive"}, {ORTH: "inkl.", LEMMA: "inklusive", NORM: "inklusive"},
{ORTH: "insb.", LEMMA: "insbesondere"}, {ORTH: "insb.", LEMMA: "insbesondere", NORM: "insbesondere"},
{ORTH: "kath.", LEMMA: "katholisch"}, {ORTH: "kath.", LEMMA: "katholisch", NORM: "katholisch"},
{ORTH: "lt.", LEMMA: "laut"}, {ORTH: "lt.", LEMMA: "laut", NORM: "laut"},
{ORTH: "max.", LEMMA: "maximal"}, {ORTH: "max.", LEMMA: "maximal", NORM: "maximal"},
{ORTH: "min.", LEMMA: "minimal"}, {ORTH: "min.", LEMMA: "minimal", NORM: "minimal"},
{ORTH: "mind.", LEMMA: "mindestens"}, {ORTH: "mind.", LEMMA: "mindestens", NORM: "mindestens"},
{ORTH: "mtl.", LEMMA: "monatlich"}, {ORTH: "mtl.", LEMMA: "monatlich", NORM: "monatlich"},
{ORTH: "n.Chr.", LEMMA: "nach Christus"}, {ORTH: "n.Chr.", LEMMA: "nach Christus"},
{ORTH: "orig.", LEMMA: "original"}, {ORTH: "orig.", LEMMA: "original", NORM: "original"},
{ORTH: "röm.", LEMMA: "römisch"}, {ORTH: "röm.", LEMMA: "römisch", NORM: "römisch"},
{ORTH: "s.o.", LEMMA: "siehe oben"}, {ORTH: "s.o.", LEMMA: "siehe oben"},
{ORTH: "sog.", LEMMA: "so genannt"}, {ORTH: "sog.", LEMMA: "so genannt"},
{ORTH: "stellv.", LEMMA: "stellvertretend"}, {ORTH: "stellv.", LEMMA: "stellvertretend"},
{ORTH: "tägl.", LEMMA: "täglich"}, {ORTH: "tägl.", LEMMA: "täglich", NORM: "täglich"},
{ORTH: "u.U.", LEMMA: "unter Umständen"}, {ORTH: "u.U.", LEMMA: "unter Umständen"},
{ORTH: "u.s.w.", LEMMA: "und so weiter"}, {ORTH: "u.s.w.", LEMMA: "und so weiter"},
{ORTH: "u.v.m.", LEMMA: "und vieles mehr"}, {ORTH: "u.v.m.", LEMMA: "und vieles mehr"},
@ -153,9 +153,9 @@ for exc_data in [
{ORTH: "v.Chr.", LEMMA: "vor Christus"}, {ORTH: "v.Chr.", LEMMA: "vor Christus"},
{ORTH: "v.a.", LEMMA: "vor allem"}, {ORTH: "v.a.", LEMMA: "vor allem"},
{ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"}, {ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"},
{ORTH: "vgl.", LEMMA: "vergleiche"}, {ORTH: "vgl.", LEMMA: "vergleiche", NORM: "vergleiche"},
{ORTH: "vllt.", LEMMA: "vielleicht"}, {ORTH: "vllt.", LEMMA: "vielleicht", NORM: "vielleicht"},
{ORTH: "vlt.", LEMMA: "vielleicht"}, {ORTH: "vlt.", LEMMA: "vielleicht", NORM: "vielleicht"},
{ORTH: "z.B.", LEMMA: "zum Beispiel"}, {ORTH: "z.B.", LEMMA: "zum Beispiel"},
{ORTH: "z.Bsp.", LEMMA: "zum Beispiel"}, {ORTH: "z.Bsp.", LEMMA: "zum Beispiel"},
{ORTH: "z.T.", LEMMA: "zum Teil"}, {ORTH: "z.T.", LEMMA: "zum Teil"},
@ -163,7 +163,7 @@ for exc_data in [
{ORTH: "z.Zt.", LEMMA: "zur Zeit"}, {ORTH: "z.Zt.", LEMMA: "zur Zeit"},
{ORTH: "z.b.", LEMMA: "zum Beispiel"}, {ORTH: "z.b.", LEMMA: "zum Beispiel"},
{ORTH: "zzgl.", LEMMA: "zuzüglich"}, {ORTH: "zzgl.", LEMMA: "zuzüglich"},
{ORTH: "österr.", LEMMA: "österreichisch"}]: {ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [dict(exc_data)]

View File

@@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .norm_exceptions import NORM_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -10,14 +11,17 @@ from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups

 class EnglishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'en'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
+                                         BASE_NORMS, NORM_EXCEPTIONS)
     lex_attr_getters.update(LEX_ATTRS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)

File diff suppressed because it is too large

View File

@ -15,20 +15,20 @@ _exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
for pron in ["i"]: for pron in ["i"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'m"] = [ _exc[orth + "'m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}] {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP", "tenspect": 1, "number": 1}]
_exc[orth + "m"] = [ _exc[orth + "m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }] {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }]
_exc[orth + "'ma"] = [ _exc[orth + "'ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", NORM: "am"}, {ORTH: "'m", LEMMA: "be", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}] {ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
_exc[orth + "ma"] = [ _exc[orth + "ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "m", LEMMA: "be", NORM: "am"}, {ORTH: "m", LEMMA: "be", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}] {ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
@ -36,72 +36,72 @@ for pron in ["i"]:
for pron in ["i", "you", "he", "she", "it", "we", "they"]: for pron in ["i", "you", "he", "she", "it", "we", "they"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'ll"] = [ _exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}] {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "ll"] = [ _exc[orth + "ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}] {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "'ll've"] = [ _exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}, {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "llve"] = [ _exc[orth + "llve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}, {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'d"] = [ _exc[orth + "'d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}] {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}]
_exc[orth + "d"] = [ _exc[orth + "d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}] {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}]
_exc[orth + "'d've"] = [ _exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "dve"] = [ _exc[orth + "dve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for pron in ["i", "you", "we", "they"]: for pron in ["i", "you", "we", "they"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'ve"] = [ _exc[orth + "'ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "ve"] = [ _exc[orth + "ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for pron in ["you", "we", "they"]: for pron in ["you", "we", "they"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'re"] = [ _exc[orth + "'re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'re", LEMMA: "be", NORM: "are"}] {ORTH: "'re", LEMMA: "be", NORM: "are"}]
_exc[orth + "re"] = [ _exc[orth + "re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}] {ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]
for pron in ["he", "she", "it"]: for pron in ["he", "she", "it"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'s"] = [ _exc[orth + "'s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'s"}] {ORTH: "'s", NORM: "'s"}]
_exc[orth + "s"] = [ _exc[orth + "s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "s"}] {ORTH: "s"}]
@ -110,111 +110,111 @@ for pron in ["he", "she", "it"]:
for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
for orth in [word, word.title()]: for orth in [word, word.title()]:
_exc[orth + "'s"] = [ _exc[orth + "'s"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'s"}] {ORTH: "'s", NORM: "'s"}]
_exc[orth + "s"] = [ _exc[orth + "s"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "s"}] {ORTH: "s"}]
_exc[orth + "'ll"] = [ _exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}] {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "ll"] = [ _exc[orth + "ll"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}] {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "'ll've"] = [ _exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}, {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "llve"] = [ _exc[orth + "llve"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}, {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'re"] = [ _exc[orth + "'re"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'re", LEMMA: "be", NORM: "are"}] {ORTH: "'re", LEMMA: "be", NORM: "are"}]
_exc[orth + "re"] = [ _exc[orth + "re"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "re", LEMMA: "be", NORM: "are"}] {ORTH: "re", LEMMA: "be", NORM: "are"}]
_exc[orth + "'ve"] = [ _exc[orth + "'ve"] = [
{ORTH: orth}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
_exc[orth + "ve"] = [ _exc[orth + "ve"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'d"] = [ _exc[orth + "'d"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'d"}] {ORTH: "'d", NORM: "'d"}]
_exc[orth + "d"] = [ _exc[orth + "d"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "d"}] {ORTH: "d"}]
_exc[orth + "'d've"] = [ _exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "dve"] = [ _exc[orth + "dve"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
# Verbs # Verbs
for verb_data in [ for verb_data in [
{ORTH: "ca", LEMMA: "can", TAG: "MD"}, {ORTH: "ca", LEMMA: "can", NORM: "can", TAG: "MD"},
{ORTH: "could", TAG: "MD"}, {ORTH: "could", NORM: "could", TAG: "MD"},
{ORTH: "do", LEMMA: "do"}, {ORTH: "do", LEMMA: "do", NORM: "do"},
{ORTH: "does", LEMMA: "do"}, {ORTH: "does", LEMMA: "do", NORM: "does"},
{ORTH: "did", LEMMA: "do", TAG: "VBD"}, {ORTH: "did", LEMMA: "do", NORM: "do", TAG: "VBD"},
{ORTH: "had", LEMMA: "have", TAG: "VBD"}, {ORTH: "had", LEMMA: "have", NORM: "have", TAG: "VBD"},
{ORTH: "may", TAG: "MD"}, {ORTH: "may", NORM: "may", TAG: "MD"},
{ORTH: "might", TAG: "MD"}, {ORTH: "might", NORM: "might", TAG: "MD"},
{ORTH: "must", TAG: "MD"}, {ORTH: "must", NORM: "must", TAG: "MD"},
{ORTH: "need"}, {ORTH: "need", NORM: "need"},
{ORTH: "ought"}, {ORTH: "ought", NORM: "ought", TAG: "MD"},
{ORTH: "sha", LEMMA: "shall", TAG: "MD"}, {ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"},
{ORTH: "should", TAG: "MD"}, {ORTH: "should", NORM: "should", TAG: "MD"},
{ORTH: "wo", LEMMA: "will", TAG: "MD"}, {ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "would", TAG: "MD"}]: {ORTH: "would", NORM: "would", TAG: "MD"}]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "n't"] = [ _exc[data[ORTH] + "n't"] = [
dict(data), dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}] {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "nt"] = [ _exc[data[ORTH] + "nt"] = [
dict(data), dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}] {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "n't've"] = [ _exc[data[ORTH] + "n't've"] = [
dict(data), dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}, {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[data[ORTH] + "ntve"] = [ _exc[data[ORTH] + "ntve"] = [
dict(data), dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}, {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for verb_data in [ for verb_data in [
{ORTH: "could", TAG: "MD"}, {ORTH: "could", NORM: "could", TAG: "MD"},
{ORTH: "might"}, {ORTH: "might", NORM: "might", TAG: "MD"},
{ORTH: "must"}, {ORTH: "must", NORM: "must", TAG: "MD"},
{ORTH: "should"}]: {ORTH: "should", NORM: "should", TAG: "MD"}]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
@ -228,21 +228,21 @@ for verb_data in [
for verb_data in [ for verb_data in [
{ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, {ORTH: "ai", LEMMA: "be", TAG: "VBP", "number": 2},
{ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2}, {ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
{ORTH: "is", LEMMA: "be", TAG: "VBZ"}, {ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
{ORTH: "was", LEMMA: "be"}, {ORTH: "was", LEMMA: "be", NORM: "was"},
{ORTH: "were", LEMMA: "be"}]: {ORTH: "were", LEMMA: "be", NORM: "were"}]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "n't"] = [ _exc[data[ORTH] + "n't"] = [
dict(data), dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}] {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "nt"] = [ _exc[data[ORTH] + "nt"] = [
dict(data), dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}] {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
# Other contractions with trailing apostrophe # Other contractions with trailing apostrophe
@ -250,10 +250,10 @@ for verb_data in [
for exc_data in [ for exc_data in [
{ORTH: "doin", LEMMA: "do", NORM: "doing"}, {ORTH: "doin", LEMMA: "do", NORM: "doing"},
{ORTH: "goin", LEMMA: "go", NORM: "going"}, {ORTH: "goin", LEMMA: "go", NORM: "going"},
{ORTH: "nothin", LEMMA: "nothing"}, {ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"},
{ORTH: "nuthin", LEMMA: "nothing"}, {ORTH: "nuthin", LEMMA: "nothing", NORM: "nothing"},
{ORTH: "ol", LEMMA: "old"}, {ORTH: "ol", LEMMA: "old", NORM: "old"},
{ORTH: "somethin", LEMMA: "something"}]: {ORTH: "somethin", LEMMA: "something", NORM: "something"}]:
exc_data_tc = dict(exc_data) exc_data_tc = dict(exc_data)
exc_data_tc[ORTH] = exc_data_tc[ORTH].title() exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
for data in [exc_data, exc_data_tc]: for data in [exc_data, exc_data_tc]:
@ -266,10 +266,10 @@ for exc_data in [
# Other contractions with leading apostrophe # Other contractions with leading apostrophe
for exc_data in [ for exc_data in [
{ORTH: "cause", LEMMA: "because"}, {ORTH: "cause", LEMMA: "because", NORM: "because"},
{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"}, {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
{ORTH: "ll", LEMMA: "will"}, {ORTH: "ll", LEMMA: "will", NORM: "will"},
{ORTH: "nuff", LEMMA: "enough"}]: {ORTH: "nuff", LEMMA: "enough", NORM: "enough"}]:
exc_data_apos = dict(exc_data) exc_data_apos = dict(exc_data)
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH] exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
for data in [exc_data, exc_data_apos]: for data in [exc_data, exc_data_apos]:
@ -282,11 +282,11 @@ for h in range(1, 12 + 1):
for period in ["a.m.", "am"]: for period in ["a.m.", "am"]:
_exc["%d%s" % (h, period)] = [ _exc["%d%s" % (h, period)] = [
{ORTH: "%d" % h}, {ORTH: "%d" % h},
{ORTH: period, LEMMA: "a.m."}] {ORTH: period, LEMMA: "a.m.", NORM: "a.m."}]
for period in ["p.m.", "pm"]: for period in ["p.m.", "pm"]:
_exc["%d%s" % (h, period)] = [ _exc["%d%s" % (h, period)] = [
{ORTH: "%d" % h}, {ORTH: "%d" % h},
{ORTH: period, LEMMA: "p.m."}] {ORTH: period, LEMMA: "p.m.", NORM: "p.m."}]
# Rest # Rest
@ -306,56 +306,56 @@ _other_exc = {
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}], {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
"How'd'y": [ "How'd'y": [
{ORTH: "How", LEMMA: "how"}, {ORTH: "How", LEMMA: "how", NORM: "how"},
{ORTH: "'d", LEMMA: "do"}, {ORTH: "'d", LEMMA: "do"},
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}], {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
"not've": [ "not've": [
{ORTH: "not", LEMMA: "not", TAG: "RB"}, {ORTH: "not", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}], {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"notve": [ "notve": [
{ORTH: "not", LEMMA: "not", TAG: "RB"}, {ORTH: "not", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}], {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"Not've": [ "Not've": [
{ORTH: "Not", LEMMA: "not", TAG: "RB"}, {ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}], {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"Notve": [ "Notve": [
{ORTH: "Not", LEMMA: "not", TAG: "RB"}, {ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}], {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"cannot": [ "cannot": [
{ORTH: "can", LEMMA: "can", TAG: "MD"}, {ORTH: "can", LEMMA: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"}], {ORTH: "not", LEMMA: "not", TAG: "RB"}],
"Cannot": [ "Cannot": [
{ORTH: "Can", LEMMA: "can", TAG: "MD"}, {ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"}], {ORTH: "not", LEMMA: "not", TAG: "RB"}],
"gonna": [ "gonna": [
{ORTH: "gon", LEMMA: "go", NORM: "going"}, {ORTH: "gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"}], {ORTH: "na", LEMMA: "to", NORM: "to"}],
"Gonna": [ "Gonna": [
{ORTH: "Gon", LEMMA: "go", NORM: "going"}, {ORTH: "Gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"}], {ORTH: "na", LEMMA: "to", NORM: "to"}],
"gotta": [ "gotta": [
{ORTH: "got"}, {ORTH: "got"},
{ORTH: "ta", LEMMA: "to"}], {ORTH: "ta", LEMMA: "to", NORM: "to"}],
"Gotta": [ "Gotta": [
{ORTH: "Got"}, {ORTH: "Got", NORM: "got"},
{ORTH: "ta", LEMMA: "to"}], {ORTH: "ta", LEMMA: "to", NORM: "to"}],
"let's": [ "let's": [
{ORTH: "let"}, {ORTH: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}], {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
"Let's": [ "Let's": [
{ORTH: "Let", LEMMA: "let"}, {ORTH: "Let", LEMMA: "let", NORM: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}] {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
} }
@ -363,72 +363,80 @@ _exc.update(_other_exc)
for exc_data in [ for exc_data in [
{ORTH: "'S", LEMMA: "'s"}, {ORTH: "'S", LEMMA: "'s", NORM: "'s"},
{ORTH: "'s", LEMMA: "'s"}, {ORTH: "'s", LEMMA: "'s", NORM: "'s"},
{ORTH: "\u2018S", LEMMA: "'s"}, {ORTH: "\u2018S", LEMMA: "'s", NORM: "'s"},
{ORTH: "\u2018s", LEMMA: "'s"}, {ORTH: "\u2018s", LEMMA: "'s", NORM: "'s"},
{ORTH: "and/or", LEMMA: "and/or", TAG: "CC"}, {ORTH: "and/or", LEMMA: "and/or", NORM: "and/or", TAG: "CC"},
{ORTH: "w/o", LEMMA: "without", NORM: "without"},
{ORTH: "'re", LEMMA: "be", NORM: "are"}, {ORTH: "'re", LEMMA: "be", NORM: "are"},
{ORTH: "'Cause", LEMMA: "because"}, {ORTH: "'Cause", LEMMA: "because", NORM: "because"},
{ORTH: "'cause", LEMMA: "because"}, {ORTH: "'cause", LEMMA: "because", NORM: "because"},
{ORTH: "ma'am", LEMMA: "madam"}, {ORTH: "'cos", LEMMA: "because", NORM: "because"},
{ORTH: "Ma'am", LEMMA: "madam"}, {ORTH: "'Cos", LEMMA: "because", NORM: "because"},
{ORTH: "o'clock", LEMMA: "o'clock"}, {ORTH: "'coz", LEMMA: "because", NORM: "because"},
{ORTH: "O'clock", LEMMA: "o'clock"}, {ORTH: "'Coz", LEMMA: "because", NORM: "because"},
{ORTH: "'cuz", LEMMA: "because", NORM: "because"},
{ORTH: "'Cuz", LEMMA: "because", NORM: "because"},
{ORTH: "'bout", LEMMA: "about", NORM: "about"},
{ORTH: "ma'am", LEMMA: "madam", NORM: "madam"},
{ORTH: "Ma'am", LEMMA: "madam", NORM: "madam"},
{ORTH: "o'clock", LEMMA: "o'clock", NORM: "o'clock"},
{ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"},
{ORTH: "Mt.", LEMMA: "Mount"}, {ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"},
{ORTH: "Ak.", LEMMA: "Alaska"}, {ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"},
{ORTH: "Ala.", LEMMA: "Alabama"}, {ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"},
{ORTH: "Apr.", LEMMA: "April"}, {ORTH: "Apr.", LEMMA: "April", NORM: "April"},
{ORTH: "Ariz.", LEMMA: "Arizona"}, {ORTH: "Ariz.", LEMMA: "Arizona", NORM: "Arizona"},
{ORTH: "Ark.", LEMMA: "Arkansas"}, {ORTH: "Ark.", LEMMA: "Arkansas", NORM: "Arkansas"},
{ORTH: "Aug.", LEMMA: "August"}, {ORTH: "Aug.", LEMMA: "August", NORM: "August"},
{ORTH: "Calif.", LEMMA: "California"}, {ORTH: "Calif.", LEMMA: "California", NORM: "California"},
{ORTH: "Colo.", LEMMA: "Colorado"}, {ORTH: "Colo.", LEMMA: "Colorado", NORM: "Colorado"},
{ORTH: "Conn.", LEMMA: "Connecticut"}, {ORTH: "Conn.", LEMMA: "Connecticut", NORM: "Connecticut"},
{ORTH: "Dec.", LEMMA: "December"}, {ORTH: "Dec.", LEMMA: "December", NORM: "December"},
{ORTH: "Del.", LEMMA: "Delaware"}, {ORTH: "Del.", LEMMA: "Delaware", NORM: "Delaware"},
{ORTH: "Feb.", LEMMA: "February"}, {ORTH: "Feb.", LEMMA: "February", NORM: "February"},
{ORTH: "Fla.", LEMMA: "Florida"}, {ORTH: "Fla.", LEMMA: "Florida", NORM: "Florida"},
{ORTH: "Ga.", LEMMA: "Georgia"}, {ORTH: "Ga.", LEMMA: "Georgia", NORM: "Georgia"},
{ORTH: "Ia.", LEMMA: "Iowa"}, {ORTH: "Ia.", LEMMA: "Iowa", NORM: "Iowa"},
{ORTH: "Id.", LEMMA: "Idaho"}, {ORTH: "Id.", LEMMA: "Idaho", NORM: "Idaho"},
{ORTH: "Ill.", LEMMA: "Illinois"}, {ORTH: "Ill.", LEMMA: "Illinois", NORM: "Illinois"},
{ORTH: "Ind.", LEMMA: "Indiana"}, {ORTH: "Ind.", LEMMA: "Indiana", NORM: "Indiana"},
{ORTH: "Jan.", LEMMA: "January"}, {ORTH: "Jan.", LEMMA: "January", NORM: "January"},
{ORTH: "Jul.", LEMMA: "July"}, {ORTH: "Jul.", LEMMA: "July", NORM: "July"},
{ORTH: "Jun.", LEMMA: "June"}, {ORTH: "Jun.", LEMMA: "June", NORM: "June"},
{ORTH: "Kan.", LEMMA: "Kansas"}, {ORTH: "Kan.", LEMMA: "Kansas", NORM: "Kansas"},
{ORTH: "Kans.", LEMMA: "Kansas"}, {ORTH: "Kans.", LEMMA: "Kansas", NORM: "Kansas"},
{ORTH: "Ky.", LEMMA: "Kentucky"}, {ORTH: "Ky.", LEMMA: "Kentucky", NORM: "Kentucky"},
{ORTH: "La.", LEMMA: "Louisiana"}, {ORTH: "La.", LEMMA: "Louisiana", NORM: "Louisiana"},
{ORTH: "Mar.", LEMMA: "March"}, {ORTH: "Mar.", LEMMA: "March", NORM: "March"},
{ORTH: "Mass.", LEMMA: "Massachusetts"}, {ORTH: "Mass.", LEMMA: "Massachusetts", NORM: "Massachusetts"},
{ORTH: "May.", LEMMA: "May"}, {ORTH: "May.", LEMMA: "May", NORM: "May"},
{ORTH: "Mich.", LEMMA: "Michigan"}, {ORTH: "Mich.", LEMMA: "Michigan", NORM: "Michigan"},
{ORTH: "Minn.", LEMMA: "Minnesota"}, {ORTH: "Minn.", LEMMA: "Minnesota", NORM: "Minnesota"},
{ORTH: "Miss.", LEMMA: "Mississippi"}, {ORTH: "Miss.", LEMMA: "Mississippi", NORM: "Mississippi"},
{ORTH: "N.C.", LEMMA: "North Carolina"}, {ORTH: "N.C.", LEMMA: "North Carolina", NORM: "North Carolina"},
{ORTH: "N.D.", LEMMA: "North Dakota"}, {ORTH: "N.D.", LEMMA: "North Dakota", NORM: "North Dakota"},
{ORTH: "N.H.", LEMMA: "New Hampshire"}, {ORTH: "N.H.", LEMMA: "New Hampshire", NORM: "New Hampshire"},
{ORTH: "N.J.", LEMMA: "New Jersey"}, {ORTH: "N.J.", LEMMA: "New Jersey", NORM: "New Jersey"},
{ORTH: "N.M.", LEMMA: "New Mexico"}, {ORTH: "N.M.", LEMMA: "New Mexico", NORM: "New Mexico"},
{ORTH: "N.Y.", LEMMA: "New York"}, {ORTH: "N.Y.", LEMMA: "New York", NORM: "New York"},
{ORTH: "Neb.", LEMMA: "Nebraska"}, {ORTH: "Neb.", LEMMA: "Nebraska", NORM: "Nebraska"},
{ORTH: "Nebr.", LEMMA: "Nebraska"}, {ORTH: "Nebr.", LEMMA: "Nebraska", NORM: "Nebraska"},
{ORTH: "Nev.", LEMMA: "Nevada"}, {ORTH: "Nev.", LEMMA: "Nevada", NORM: "Nevada"},
{ORTH: "Nov.", LEMMA: "November"}, {ORTH: "Nov.", LEMMA: "November", NORM: "November"},
{ORTH: "Oct.", LEMMA: "October"}, {ORTH: "Oct.", LEMMA: "October", NORM: "October"},
{ORTH: "Okla.", LEMMA: "Oklahoma"}, {ORTH: "Okla.", LEMMA: "Oklahoma", NORM: "Oklahoma"},
{ORTH: "Ore.", LEMMA: "Oregon"}, {ORTH: "Ore.", LEMMA: "Oregon", NORM: "Oregon"},
{ORTH: "Pa.", LEMMA: "Pennsylvania"}, {ORTH: "Pa.", LEMMA: "Pennsylvania", NORM: "Pennsylvania"},
{ORTH: "S.C.", LEMMA: "South Carolina"}, {ORTH: "S.C.", LEMMA: "South Carolina", NORM: "South Carolina"},
{ORTH: "Sep.", LEMMA: "September"}, {ORTH: "Sep.", LEMMA: "September", NORM: "September"},
{ORTH: "Sept.", LEMMA: "September"}, {ORTH: "Sept.", LEMMA: "September", NORM: "September"},
{ORTH: "Tenn.", LEMMA: "Tennessee"}, {ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"},
{ORTH: "Va.", LEMMA: "Virginia"}, {ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
{ORTH: "Wash.", LEMMA: "Washington"}, {ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
{ORTH: "Wis.", LEMMA: "Wisconsin"}]: {ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [dict(exc_data)]

View File

@@ -0,0 +1,46 @@
# coding: utf8
from __future__ import unicode_literals

# These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them -
# for example, British vs. American spelling in English.

# Norms are only set if no alternative is provided in the tokenizer exceptions.
# Note that this does not change any other token attributes. Its main purpose
# is to normalise the word representations so that equivalent tokens receive
# similar representations. For example: $ and € are very different, but they're
# both currency symbols. By normalising currency symbols to $, all symbols are
# seen as similar, no matter how common they are in the training data.

BASE_NORMS = {
    "'s": "'s",
    "'S": "'s",
    "’s": "'s",
    "’S": "'s",
    "’": "'",
    "‘": "'",
    "´": "'",
    "`": "'",
    "“": '"',
    "”": '"',
    "''": '"',
    "``": '"',
    "´´": '"',
    "„": '"',
    "»": '"',
    "«": '"',
    "…": "...",
    "—": "-",
    "–": "-",
    "--": "-",
    "---": "-",
    "€": "$",
    "£": "$",
    "¥": "$",
    "฿": "$",
    "US$": "$",
    "C$": "$",
    "A$": "$"
}
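A quick sketch of the intended effect on the lexeme-level NORM attribute, built with the tokenizer only so no model is needed; the printed output is approximate:

from spacy.util import get_lang_class

tokenizer = get_lang_class('en').Defaults.create_tokenizer()
doc = tokenizer(u'“Hello” costs €5')
print([t.norm_ for t in doc])
# roughly: ['"', 'hello', '"', 'costs', '$', '5']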

View File

@@ -301,7 +301,7 @@ class Language(object):
     def evaluate(self, docs_golds):
         docs, golds = zip(*docs_golds)
         scorer = Scorer()
-        for doc, gold in zip(self.pipe(docs), golds):
+        for doc, gold in zip(self.pipe(docs, batch_size=32), golds):
             scorer.score(doc, gold)
             doc.tensor = None
         return scorer

View File

@@ -38,7 +38,7 @@ cdef class Morphology:
         self.strings = string_store
         self.tag_map = {}
         self.lemmatizer = lemmatizer
-        self.n_tags = len(tag_map) + 1
+        self.n_tags = len(tag_map)
         self.tag_names = tuple(sorted(tag_map.keys()))
         self.reverse_index = {}

View File

@@ -8,20 +8,33 @@ import pytest
 @pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"])
-def test_tokenizer_splits_contractions(de_tokenizer, text):
+def test_de_tokenizer_splits_contractions(de_tokenizer, text):
     tokens = de_tokenizer(text)
     assert len(tokens) == 2

 @pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
-def test_tokenizer_handles_abbr(de_tokenizer, text):
+def test_de_tokenizer_handles_abbr(de_tokenizer, text):
     tokens = de_tokenizer(text)
     assert len(tokens) == 1

-def test_tokenizer_handles_exc_in_text(de_tokenizer):
+def test_de_tokenizer_handles_exc_in_text(de_tokenizer):
     text = "Ich bin z.Zt. im Urlaub."
     tokens = de_tokenizer(text)
     assert len(tokens) == 6
     assert tokens[2].text == "z.Zt."
     assert tokens[2].lemma_ == "zur Zeit"

+@pytest.mark.parametrize('text,norms', [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])])
+def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
+    tokens = de_tokenizer(text)
+    assert [token.norm_ for token in tokens] == norms

+@pytest.mark.xfail
+@pytest.mark.parametrize('text,norm', [("daß", "dass")])
+def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
+    tokens = de_tokenizer(text)
+    assert tokens[0].norm_ == norm

View File

@@ -102,3 +102,16 @@ def test_en_tokenizer_handles_times(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 2
     assert tokens[1].lemma_ in ["a.m.", "p.m."]

+@pytest.mark.parametrize('text,norms', [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])])
+def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
+    tokens = en_tokenizer(text)
+    assert [token.norm_ for token in tokens] == norms

+@pytest.mark.xfail
+@pytest.mark.parametrize('text,norm', [("radicalised", "radicalized"), ("cuz", "because")])
+def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
+    tokens = en_tokenizer(text)
+    assert tokens[0].norm_ == norm

View File

@@ -0,0 +1,33 @@
# coding: utf-8
from __future__ import unicode_literals

from ...util import get_lang_class
from ..util import make_tempdir, assert_packed_msg_equal

import pytest


def load_tokenizer(b):
    tok = get_lang_class('en').Defaults.create_tokenizer()
    tok.from_bytes(b)
    return tok


@pytest.mark.parametrize('text', ["I💜you", "theyre", "“hello”"])
def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
    tokenizer = en_tokenizer
    new_tokenizer = load_tokenizer(tokenizer.to_bytes())
    assert_packed_msg_equal(new_tokenizer.to_bytes(), tokenizer.to_bytes())
    # assert new_tokenizer.to_bytes() == tokenizer.to_bytes()
    doc1 = tokenizer(text)
    doc2 = new_tokenizer(text)
    assert [token.text for token in doc1] == [token.text for token in doc2]


def test_serialize_tokenizer_roundtrip_disk(en_tokenizer):
    tokenizer = en_tokenizer
    with make_tempdir() as d:
        file_path = d / 'tokenizer'
        tokenizer.to_disk(file_path)
        tokenizer_d = en_tokenizer.from_disk(file_path)
        assert tokenizer.to_bytes() == tokenizer_d.to_bytes()

View File

@@ -10,6 +10,7 @@ import numpy
 import tempfile
 import shutil
 import contextlib
+import msgpack
 from pathlib import Path
@@ -105,3 +106,13 @@ def assert_docs_equal(doc1, doc2):
     assert [ t.ent_type for t in doc1 ] == [ t.ent_type for t in doc2 ]
     assert [ t.ent_iob for t in doc1 ] == [ t.ent_iob for t in doc2 ]
     assert [ ent for ent in doc1.ents ] == [ ent for ent in doc2.ents ]

+def assert_packed_msg_equal(b1, b2):
+    """Assert that two packed msgpack messages are equal."""
+    msg1 = msgpack.loads(b1, encoding='utf8')
+    msg2 = msgpack.loads(b2, encoding='utf8')
+    assert sorted(msg1.keys()) == sorted(msg2.keys())
+    for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
+        assert k1 == k2
+        assert v1 == v2
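The helper exists because two msgpack payloads can differ byte-for-byte purely through key order. A tiny illustration, assuming the helper is importable as spacy.tests.util:

import msgpack
from spacy.tests.util import assert_packed_msg_equal

b1 = msgpack.dumps({'a': 1, 'b': 2})
b2 = msgpack.dumps({'b': 2, 'a': 1})
assert_packed_msg_equal(b1, b2)   # passes even if b1 != b2 as raw bytes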

View File

@@ -2,6 +2,7 @@
 # coding: utf8
 from __future__ import unicode_literals

+from collections import OrderedDict
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
 from cymem.cymem cimport Pool
@@ -355,14 +356,14 @@ cdef class Tokenizer:
         **exclude: Named attributes to prevent from being serialized.
         RETURNS (bytes): The serialized form of the `Tokenizer` object.
         """
-        serializers = {
-            'vocab': lambda: self.vocab.to_bytes(),
-            'prefix_search': lambda: self.prefix_search.__self__.pattern,
-            'suffix_search': lambda: self.suffix_search.__self__.pattern,
-            'infix_finditer': lambda: self.infix_finditer.__self__.pattern,
-            'token_match': lambda: self.token_match.__self__.pattern,
-            'exceptions': lambda: self._rules
-        }
+        serializers = OrderedDict((
+            ('vocab', lambda: self.vocab.to_bytes()),
+            ('prefix_search', lambda: self.prefix_search.__self__.pattern),
+            ('suffix_search', lambda: self.suffix_search.__self__.pattern),
+            ('infix_finditer', lambda: self.infix_finditer.__self__.pattern),
+            ('token_match', lambda: self.token_match.__self__.pattern),
+            ('exceptions', lambda: OrderedDict(sorted(self._rules.items())))
+        ))
         return util.to_bytes(serializers, exclude)

     def from_bytes(self, bytes_data, **exclude):
@@ -372,15 +373,15 @@ cdef class Tokenizer:
         **exclude: Named attributes to prevent from being loaded.
         RETURNS (Tokenizer): The `Tokenizer` object.
         """
-        data = {}
-        deserializers = {
-            'vocab': lambda b: self.vocab.from_bytes(b),
-            'prefix_search': lambda b: data.setdefault('prefix', b),
-            'suffix_search': lambda b: data.setdefault('suffix_search', b),
-            'infix_finditer': lambda b: data.setdefault('infix_finditer', b),
-            'token_match': lambda b: data.setdefault('token_match', b),
-            'exceptions': lambda b: data.setdefault('rules', b)
-        }
+        data = OrderedDict()
+        deserializers = OrderedDict((
+            ('vocab', lambda b: self.vocab.from_bytes(b)),
+            ('prefix_search', lambda b: data.setdefault('prefix', b)),
+            ('suffix_search', lambda b: data.setdefault('suffix_search', b)),
+            ('infix_finditer', lambda b: data.setdefault('infix_finditer', b)),
+            ('token_match', lambda b: data.setdefault('token_match', b)),
+            ('exceptions', lambda b: data.setdefault('rules', b))
+        ))
         msg = util.from_bytes(bytes_data, deserializers, exclude)
         if 'prefix_search' in data:
             self.prefix_search = re.compile(data['prefix_search']).search
@@ -392,3 +393,4 @@ cdef class Tokenizer:
             self.token_match = re.compile(data['token_match']).search
         for string, substrings in data.get('rules', {}).items():
             self.add_special_case(string, substrings)
+        return self
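With from_bytes now returning self, a serialization round-trip can be chained in one expression. A rough sketch mirroring the new serializer test earlier in this diff:

from spacy.util import get_lang_class

create_tokenizer = get_lang_class('en').Defaults.create_tokenizer
tokenizer = create_tokenizer()
new_tokenizer = create_tokenizer().from_bytes(tokenizer.to_bytes())   # chaining now works
doc1 = tokenizer(u'theyre here')
doc2 = new_tokenizer(u'theyre here')
assert [t.text for t in doc1] == [t.text for t in doc2]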

View File

@@ -437,7 +437,8 @@ cdef class Doc:
         """
         def __get__(self):
             if 'sents' in self.user_hooks:
-                return self.user_hooks['sents'](self)
+                yield from self.user_hooks['sents'](self)
+                return

             if not self.is_parsed:
                 raise ValueError(
@@ -740,7 +741,7 @@ cdef class Doc:
         token.spacy = self.c[end-1].spacy
         for attr_name, attr_value in attributes.items():
             if attr_name == TAG:
                 self.vocab.morphology.assign_tag(token, attr_value)
             else:
                 Token.set_struct_attr(token, attr_name, attr_value)
         # Begin by setting all the head indices to absolute token positions
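Because Doc.sents now yields from the 'sents' user hook rather than returning it, the property behaves as a generator whether or not a hook is installed. A hedged sketch on a tokenizer-only doc (no parser required):

from spacy.util import get_lang_class

tokenizer = get_lang_class('en').Defaults.create_tokenizer()
doc = tokenizer(u'One sentence. Another one.')

def single_sentence(doc):
    yield doc[:]                      # treat the whole text as one span

doc.user_hooks['sents'] = single_sentence
assert len(list(doc.sents)) == 1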

View File

@@ -299,6 +299,22 @@ def compile_infix_regex(entries):
     return re.compile(expression)

+def add_lookups(default_func, *lookups):
+    """Extend an attribute function with special cases. If a word is in the
+    lookups, the value is returned. Otherwise the previous function is used.
+
+    default_func (callable): The default function to execute.
+    *lookups (dict): Lookup dictionary mapping string to attribute value.
+    RETURNS (callable): Lexical attribute getter.
+    """
+    def get_attr(string):
+        for lookup in lookups:
+            if string in lookup:
+                return lookup[string]
+        return default_func(string)
+    return get_attr

 def update_exc(base_exceptions, *addition_dicts):
     """Update and validate tokenizer exceptions. Will overwrite exceptions.

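A rough usage sketch of the new helper, importing it from spacy.util where this hunk adds it; the lookup tables are made up:

from spacy.util import add_lookups

lower = lambda string: string.lower()            # stand-in default getter
get_norm = add_lookups(lower, {'daß': 'dass'}, {'€': '$'})
assert get_norm('daß') == 'dass'                 # found in a lookup table
assert get_norm('Haus') == 'haus'                # falls back to the default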
View File

@@ -231,11 +231,13 @@ cdef class Vocab:
             props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True)
             token = &tokens[i]
             # Set the special tokens up to have arbitrary attributes
-            token.lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
+            lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
+            token.lex = lex
             if attrs.TAG in props:
                 self.morphology.assign_tag(token, props[attrs.TAG])
             for attr_id, value in props.items():
                 Token.set_struct_attr(token, attr_id, value)
+                Lexeme.set_struct_attr(lex, attr_id, value)
         return tokens

     @property
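Since the fused token's attributes are now also written to the underlying lexeme, values such as NORM defined on a tokenizer special case should show up on the resulting tokens. A hedged sketch; the special case is invented and overrides any existing rule for that string:

from spacy.attrs import ORTH, NORM
from spacy.util import get_lang_class

tokenizer = get_lang_class('en').Defaults.create_tokenizer()
tokenizer.add_special_case(u'lemme', [{ORTH: u'lem', NORM: u'let'}, {ORTH: u'me'}])
doc = tokenizer(u'lemme go')
print(doc[0].text, doc[0].norm_)   # expected: lem let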

View File

@@ -205,7 +205,7 @@ p
     +cell #[code arrow_spacing]
     +cell int
     +cell Spacing between arrows in px to avoid overlaps.
-    +cell #[code 20]
+    +cell #[code 20] / #[code 12] (compact)

 +row
     +cell #[code word_spacing]

View File

@@ -64,7 +64,7 @@ p
     doc = nlp(u'Give it back! He pleaded.')
     assert doc[0].text == 'Give'
     assert doc[-1].text == '.'
-    span = doc[1:1]
+    span = doc[1:3]
     assert span.text == 'it back'

 +table(["Name", "Type", "Description"])

View File

@@ -141,7 +141,7 @@ p
             else:
                 tokens.append(substring)
                 substring = ''
-        tokens.extend(suffixes)
+        tokens.extend(reversed(suffixes))
         return tokens

 p
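A toy illustration of why the snippet now reverses the collected suffixes: they are peeled off the end of the substring, so they come out in back-to-front order.

substring = u'word).'
suffixes = []
while substring and substring[-1] in (u')', u'.'):
    suffixes.append(substring[-1])      # collected as ['.', ')']
    substring = substring[:-1]

tokens = [substring]
tokens.extend(reversed(suffixes))
assert tokens == [u'word', u')', u'.']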

View File

@@ -59,9 +59,11 @@ p
     | to customise the layout, for example:

 +aside("Important note")
-    | There's currently a known issue with the #[code compact] mode for long
-    | sentences with arrow spacing. If the spacing is larger than the arc
-    | itself, it'll cause the arc and its label to flip.
+    | There's currently a known issue with the #[code compact] mode for
+    | sentences with short arrows and long dependency labels, that causes labels
+    | longer than the arrow to wrap. So if you come across this problem,
+    | especially when using custom labels, you'll have to increase the
+    | #[code distance] setting in the #[code options] to allow longer arcs.

 +table(["Name", "Type", "Description", "Default"])
 +row