Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2025-10-18 09:44:16 +03:00 · 2017-06-03 13:31:40 -05:00 · 2017-06-03 13:31:40 -05:00 · 94e063ae2a
commit 94e063ae2a
parent fea1144e6d 746653880c
19 changed files with 2109 additions and 216 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -3,7 +3,7 @@ pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.7.1,<6.8.0
+thinc>=6.7.2,<6.8.0
 murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
 six
--- a/setup.py
+++ b/setup.py
@ -191,7 +191,7 @@ def setup_package():
                'murmurhash>=0.28,<0.29',
                'cymem>=1.30,<1.32',
                'preshed>=1.0.0,<2.0.0',
-                'thinc>=6.7.1,<6.8.0',
+                'thinc>=6.7.2,<6.8.0',
                'plac<1.0.0,>=0.9.6',
                'pip>=9.0.0,<10.0.0',
                'six',
--- a/spacy/displacy/init.py
+++ b/spacy/displacy/init.py
@ -56,7 +56,12 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
    render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
    httpd = simple_server.make_server('0.0.0.0', port, app)
    prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
-    httpd.serve_forever()
+    try:
+        httpd.serve_forever()
+    except KeyboardInterrupt:
+        prints("Shutting down server on port %d." % port)
+    finally:
+        httpd.server_close()


 def app(environ, start_response):
@ -65,12 +70,13 @@ def app(environ, start_response):
    return [res]


-def parse_deps(doc, options={}):
+def parse_deps(orig_doc, options={}):
    """Generate dependency parse in {'words': [], 'arcs': []} format.

-    doc (Doc): Document do parse.
+    orig_doc (Doc): Document to parse.
    RETURNS (dict): Generated dependency parse keyed by words and arcs.
    """
+    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
    if options.get('collapse_punct', True):
        spans = []
        for word in doc[:-1]:
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@ -18,12 +18,11 @@ class DependencyRenderer(object):
                        offset_x, color, bg, font)
        """
        self.compact = options.get('compact', False)
-        distance, arrow_width = (85, 8) if self.compact else (175, 10)
        self.word_spacing = options.get('word_spacing', 45)
-        self.arrow_spacing = options.get('arrow_spacing', 20)
-        self.arrow_width = options.get('arrow_width', arrow_width)
+        self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20)
+        self.arrow_width = options.get('arrow_width', 6 if self.compact else 10)
        self.arrow_stroke = options.get('arrow_stroke', 2)
-        self.distance = options.get('distance', distance)
+        self.distance = options.get('distance', 150 if self.compact else 175)
        self.offset_x = options.get('offset_x', 50)
        self.color = options.get('color', '#000000')
        self.bg = options.get('bg', '#ffffff')
@ -99,6 +98,8 @@ class DependencyRenderer(object):
        x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
                 -self.arrow_spacing*(self.highest_level-level)/4)
        y_curve = self.offset_y-level*self.distance/2
+        if self.compact:
+            y_curve = self.offset_y-level*self.distance/6
        if y_curve == 0 and len(self.levels) > 5:
            y_curve = -self.distance
        arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
--- a/spacy/displacy/templates.py
+++ b/spacy/displacy/templates.py
@ -21,7 +21,7 @@ TPL_DEP_WORDS = """
 TPL_DEP_ARCS = """
 <g class="displacy-arrow">
    <path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/>
-    <text dy="1.25em" style="font-size: 0.8em">
+    <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
        <textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">{label}</textPath>
    </text>
    <path class="displacy-arrowhead" d="{head}" fill="currentColor"/>
--- a/spacy/lang/en/init.py
+++ b/spacy/lang/en/init.py
@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .norm_exceptions import NORM_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@ -10,14 +11,17 @@ from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups


 class EnglishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'en'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
+                                         BASE_NORMS, NORM_EXCEPTIONS)
    lex_attr_getters.update(LEX_ATTRS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
--- a/spacy/lang/en/norm_exceptions.py
+++ b/spacy/lang/en/norm_exceptions.py
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@ -15,20 +15,20 @@ _exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
 for pron in ["i"]:
    for orth in [pron, pron.title()]:
        _exc[orth + "'m"] = [
-            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}]
+            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
+            {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP", "tenspect": 1, "number": 1}]

        _exc[orth + "m"] = [
-            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
+            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
            {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }]

        _exc[orth + "'ma"] = [
-            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
+            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
            {ORTH: "'m", LEMMA: "be", NORM: "am"},
            {ORTH: "a", LEMMA: "going to", NORM: "gonna"}]

        _exc[orth + "ma"] = [
-            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
+            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
            {ORTH: "m", LEMMA: "be", NORM: "am"},
            {ORTH: "a", LEMMA: "going to", NORM: "gonna"}]

@ -36,72 +36,72 @@ for pron in ["i"]:
 for pron in ["i", "you", "he", "she", "it", "we", "they"]:
    for orth in [pron, pron.title()]:
        _exc[orth + "'ll"] = [
-            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
+            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
+            {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]

        _exc[orth + "ll"] = [
-            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "ll", LEMMA: "will", TAG: "MD"}]
+            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
+            {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]

        _exc[orth + "'ll've"] = [
-            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
+            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
+            {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
+            {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]

        _exc[orth + "llve"] = [
-            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
+            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
+            {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
+            {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]

        _exc[orth + "'d"] = [
-            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'d", LEMMA: "would", TAG: "MD"}]
+            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
+            {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}]

        _exc[orth + "d"] = [
-            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "d", LEMMA: "would", TAG: "MD"}]
+            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
+            {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}]

        _exc[orth + "'d've"] = [
-            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'d", LEMMA: "would", TAG: "MD"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
+            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
+            {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
+            {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]

        _exc[orth + "dve"] = [
-            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "d", LEMMA: "would", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
+            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
+            {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
+            {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]


 for pron in ["i", "you", "we", "they"]:
    for orth in [pron, pron.title()]:
        _exc[orth + "'ve"] = [
-            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
+            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
+            {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]

        _exc[orth + "ve"] = [
-            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
+            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
+            {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]


 for pron in ["you", "we", "they"]:
    for orth in [pron, pron.title()]:
        _exc[orth + "'re"] = [
-            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
+            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
            {ORTH: "'re", LEMMA: "be", NORM: "are"}]

        _exc[orth + "re"] = [
-            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
+            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
            {ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]


 for pron in ["he", "she", "it"]:
    for orth in [pron, pron.title()]:
        _exc[orth + "'s"] = [
-            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
-            {ORTH: "'s"}]
+            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
+            {ORTH: "'s", NORM: "'s"}]

        _exc[orth + "s"] = [
-            {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
+            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
            {ORTH: "s"}]


@ -110,111 +110,111 @@ for pron in ["he", "she", "it"]:
 for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
    for orth in [word, word.title()]:
        _exc[orth + "'s"] = [
-            {ORTH: orth, LEMMA: word},
-            {ORTH: "'s"}]
+            {ORTH: orth, LEMMA: word, NORM: word},
+            {ORTH: "'s", NORM: "'s"}]

        _exc[orth + "s"] = [
-            {ORTH: orth, LEMMA: word},
+            {ORTH: orth, LEMMA: word, NORM: word},
            {ORTH: "s"}]

        _exc[orth + "'ll"] = [
-            {ORTH: orth, LEMMA: word},
-            {ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
+            {ORTH: orth, LEMMA: word, NORM: word},
+            {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]

        _exc[orth + "ll"] = [
-            {ORTH: orth, LEMMA: word},
-            {ORTH: "ll", LEMMA: "will", TAG: "MD"}]
+            {ORTH: orth, LEMMA: word, NORM: word},
+            {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]

        _exc[orth + "'ll've"] = [
-            {ORTH: orth, LEMMA: word},
-            {ORTH: "'ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
+            {ORTH: orth, LEMMA: word, NORM: word},
+            {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
+            {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]

        _exc[orth + "llve"] = [
-            {ORTH: orth, LEMMA: word},
-            {ORTH: "ll", LEMMA: "will", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
+            {ORTH: orth, LEMMA: word, NORM: word},
+            {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
+            {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]

        _exc[orth + "'re"] = [
-            {ORTH: orth, LEMMA: word},
+            {ORTH: orth, LEMMA: word, NORM: word},
            {ORTH: "'re", LEMMA: "be", NORM: "are"}]

        _exc[orth + "re"] = [
-            {ORTH: orth, LEMMA: word},
+            {ORTH: orth, LEMMA: word, NORM: word},
            {ORTH: "re", LEMMA: "be", NORM: "are"}]

        _exc[orth + "'ve"] = [
-            {ORTH: orth},
+            {ORTH: orth, LEMMA: word, NORM: word},
            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]

        _exc[orth + "ve"] = [
            {ORTH: orth, LEMMA: word},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
+            {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]

        _exc[orth + "'d"] = [
-            {ORTH: orth, LEMMA: word},
-            {ORTH: "'d"}]
+            {ORTH: orth, LEMMA: word, NORM: word},
+            {ORTH: "'d", NORM: "'d"}]

        _exc[orth + "d"] = [
-            {ORTH: orth, LEMMA: word},
+            {ORTH: orth, LEMMA: word, NORM: word},
            {ORTH: "d"}]

        _exc[orth + "'d've"] = [
-            {ORTH: orth, LEMMA: word},
-            {ORTH: "'d", LEMMA: "would", TAG: "MD"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
+            {ORTH: orth, LEMMA: word, NORM: word},
+            {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
+            {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]

        _exc[orth + "dve"] = [
-            {ORTH: orth, LEMMA: word},
-            {ORTH: "d", LEMMA: "would", TAG: "MD"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
+            {ORTH: orth, LEMMA: word, NORM: word},
+            {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
+            {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]


 # Verbs

 for verb_data in [
-    {ORTH: "ca", LEMMA: "can", TAG: "MD"},
-    {ORTH: "could", TAG: "MD"},
-    {ORTH: "do", LEMMA: "do"},
-    {ORTH: "does", LEMMA: "do"},
-    {ORTH: "did", LEMMA: "do", TAG: "VBD"},
-    {ORTH: "had", LEMMA: "have", TAG: "VBD"},
-    {ORTH: "may", TAG: "MD"},
-    {ORTH: "might", TAG: "MD"},
-    {ORTH: "must", TAG: "MD"},
-    {ORTH: "need"},
-    {ORTH: "ought"},
-    {ORTH: "sha", LEMMA: "shall", TAG: "MD"},
-    {ORTH: "should", TAG: "MD"},
-    {ORTH: "wo", LEMMA: "will", TAG: "MD"},
-    {ORTH: "would", TAG: "MD"}]:
+    {ORTH: "ca", LEMMA: "can", NORM: "can", TAG: "MD"},
+    {ORTH: "could", NORM: "could", TAG: "MD"},
+    {ORTH: "do", LEMMA: "do", NORM: "do"},
+    {ORTH: "does", LEMMA: "do", NORM: "does"},
+    {ORTH: "did", LEMMA: "do", NORM: "do", TAG: "VBD"},
+    {ORTH: "had", LEMMA: "have", NORM: "have", TAG: "VBD"},
+    {ORTH: "may", NORM: "may", TAG: "MD"},
+    {ORTH: "might", NORM: "might", TAG: "MD"},
+    {ORTH: "must", NORM: "must", TAG: "MD"},
+    {ORTH: "need", NORM: "need"},
+    {ORTH: "ought", NORM: "ought", TAG: "MD"},
+    {ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"},
+    {ORTH: "should", NORM: "should", TAG: "MD"},
+    {ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"},
+    {ORTH: "would", NORM: "would", TAG: "MD"}]:
    verb_data_tc = dict(verb_data)
    verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
    for data in [verb_data, verb_data_tc]:
        _exc[data[ORTH] + "n't"] = [
            dict(data),
-            {ORTH: "n't", LEMMA: "not", TAG: "RB"}]
+            {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]

        _exc[data[ORTH] + "nt"] = [
            dict(data),
-            {ORTH: "nt", LEMMA: "not", TAG: "RB"}]
+            {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]

        _exc[data[ORTH] + "n't've"] = [
            dict(data),
-            {ORTH: "n't", LEMMA: "not", TAG: "RB"},
-            {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
+            {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
+            {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]

        _exc[data[ORTH] + "ntve"] = [
            dict(data),
-            {ORTH: "nt", LEMMA: "not", TAG: "RB"},
-            {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
+            {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"},
+            {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]


 for verb_data in [
-    {ORTH: "could", TAG: "MD"},
-    {ORTH: "might"},
-    {ORTH: "must"},
-    {ORTH: "should"}]:
+    {ORTH: "could", NORM: "could", TAG: "MD"},
+    {ORTH: "might", NORM: "might", TAG: "MD"},
+    {ORTH: "must", NORM: "must", TAG: "MD"},
+    {ORTH: "should", NORM: "should", TAG: "MD"}]:
    verb_data_tc = dict(verb_data)
    verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
    for data in [verb_data, verb_data_tc]:
@ -228,21 +228,21 @@ for verb_data in [


 for verb_data in [
-    {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"},
-    {ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2},
-    {ORTH: "is", LEMMA: "be", TAG: "VBZ"},
-    {ORTH: "was", LEMMA: "be"},
-    {ORTH: "were", LEMMA: "be"}]:
+    {ORTH: "ai", LEMMA: "be", TAG: "VBP", "number": 2},
+    {ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
+    {ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
+    {ORTH: "was", LEMMA: "be", NORM: "was"},
+    {ORTH: "were", LEMMA: "be", NORM: "were"}]:
    verb_data_tc = dict(verb_data)
    verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
    for data in [verb_data, verb_data_tc]:
        _exc[data[ORTH] + "n't"] = [
            dict(data),
-            {ORTH: "n't", LEMMA: "not", TAG: "RB"}]
+            {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]

        _exc[data[ORTH] + "nt"] = [
            dict(data),
-            {ORTH: "nt", LEMMA: "not", TAG: "RB"}]
+            {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]


 # Other contractions with trailing apostrophe
@ -250,10 +250,10 @@ for verb_data in [
 for exc_data in [
    {ORTH: "doin", LEMMA: "do", NORM: "doing"},
    {ORTH: "goin", LEMMA: "go", NORM: "going"},
-    {ORTH: "nothin", LEMMA: "nothing"},
-    {ORTH: "nuthin", LEMMA: "nothing"},
-    {ORTH: "ol", LEMMA: "old"},
-    {ORTH: "somethin", LEMMA: "something"}]:
+    {ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"},
+    {ORTH: "nuthin", LEMMA: "nothing", NORM: "nothing"},
+    {ORTH: "ol", LEMMA: "old", NORM: "old"},
+    {ORTH: "somethin", LEMMA: "something", NORM: "something"}]:
    exc_data_tc = dict(exc_data)
    exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
    for data in [exc_data, exc_data_tc]:
@ -266,10 +266,10 @@ for exc_data in [
 # Other contractions with leading apostrophe

 for exc_data in [
-    {ORTH: "cause", LEMMA: "because"},
+    {ORTH: "cause", LEMMA: "because", NORM: "because"},
    {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
-    {ORTH: "ll", LEMMA: "will"},
-    {ORTH: "nuff", LEMMA: "enough"}]:
+    {ORTH: "ll", LEMMA: "will", NORM: "will"},
+    {ORTH: "nuff", LEMMA: "enough", NORM: "enough"}]:
    exc_data_apos = dict(exc_data)
    exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
    for data in [exc_data, exc_data_apos]:
@ -282,11 +282,11 @@ for h in range(1, 12 + 1):
    for period in ["a.m.", "am"]:
        _exc["%d%s" % (h, period)] = [
            {ORTH: "%d" % h},
-            {ORTH: period, LEMMA: "a.m."}]
+            {ORTH: period, LEMMA: "a.m.", NORM: "a.m."}]
    for period in ["p.m.", "pm"]:
        _exc["%d%s" % (h, period)] = [
            {ORTH: "%d" % h},
-            {ORTH: period, LEMMA: "p.m."}]
+            {ORTH: period, LEMMA: "p.m.", NORM: "p.m."}]


 # Rest
@ -306,56 +306,56 @@ _other_exc = {
        {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],

    "How'd'y": [
-        {ORTH: "How", LEMMA: "how"},
+        {ORTH: "How", LEMMA: "how", NORM: "how"},
        {ORTH: "'d", LEMMA: "do"},
        {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],

    "not've": [
        {ORTH: "not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
+        {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],

    "notve": [
        {ORTH: "not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "ve", LEMMA: "have", TAG: "VB"}],
+        {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],

    "Not've": [
-        {ORTH: "Not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
+        {ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
+        {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],

    "Notve": [
-        {ORTH: "Not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "ve", LEMMA: "have", TAG: "VB"}],
+        {ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
+        {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],

    "cannot": [
        {ORTH: "can", LEMMA: "can", TAG: "MD"},
        {ORTH: "not", LEMMA: "not", TAG: "RB"}],

    "Cannot": [
-        {ORTH: "Can", LEMMA: "can", TAG: "MD"},
+        {ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"},
        {ORTH: "not", LEMMA: "not", TAG: "RB"}],

    "gonna": [
        {ORTH: "gon", LEMMA: "go", NORM: "going"},
-        {ORTH: "na", LEMMA: "to"}],
+        {ORTH: "na", LEMMA: "to", NORM: "to"}],

    "Gonna": [
        {ORTH: "Gon", LEMMA: "go", NORM: "going"},
-        {ORTH: "na", LEMMA: "to"}],
+        {ORTH: "na", LEMMA: "to", NORM: "to"}],

    "gotta": [
        {ORTH: "got"},
-        {ORTH: "ta", LEMMA: "to"}],
+        {ORTH: "ta", LEMMA: "to", NORM: "to"}],

    "Gotta": [
-        {ORTH: "Got"},
-        {ORTH: "ta", LEMMA: "to"}],
+        {ORTH: "Got", NORM: "got"},
+        {ORTH: "ta", LEMMA: "to", NORM: "to"}],

    "let's": [
        {ORTH: "let"},
        {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],

    "Let's": [
-        {ORTH: "Let", LEMMA: "let"},
+        {ORTH: "Let", LEMMA: "let", NORM: "let"},
        {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
 }

@ -363,72 +363,80 @@ _exc.update(_other_exc)


 for exc_data in [
-    {ORTH: "'S", LEMMA: "'s"},
-    {ORTH: "'s", LEMMA: "'s"},
-    {ORTH: "\u2018S", LEMMA: "'s"},
-    {ORTH: "\u2018s", LEMMA: "'s"},
-    {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"},
+    {ORTH: "'S", LEMMA: "'s", NORM: "'s"},
+    {ORTH: "'s", LEMMA: "'s", NORM: "'s"},
+    {ORTH: "\u2018S", LEMMA: "'s", NORM: "'s"},
+    {ORTH: "\u2018s", LEMMA: "'s", NORM: "'s"},
+    {ORTH: "and/or", LEMMA: "and/or", NORM: "and/or", TAG: "CC"},
+    {ORTH: "w/o", LEMMA: "without", NORM: "without"},
    {ORTH: "'re", LEMMA: "be", NORM: "are"},
-    {ORTH: "'Cause", LEMMA: "because"},
-    {ORTH: "'cause", LEMMA: "because"},
-    {ORTH: "ma'am", LEMMA: "madam"},
-    {ORTH: "Ma'am", LEMMA: "madam"},
-    {ORTH: "o'clock", LEMMA: "o'clock"},
-    {ORTH: "O'clock", LEMMA: "o'clock"},
+    {ORTH: "'Cause", LEMMA: "because", NORM: "because"},
+    {ORTH: "'cause", LEMMA: "because", NORM: "because"},
+    {ORTH: "'cos", LEMMA: "because", NORM: "because"},
+    {ORTH: "'Cos", LEMMA: "because", NORM: "because"},
+    {ORTH: "'coz", LEMMA: "because", NORM: "because"},
+    {ORTH: "'Coz", LEMMA: "because", NORM: "because"},
+    {ORTH: "'cuz", LEMMA: "because", NORM: "because"},
+    {ORTH: "'Cuz", LEMMA: "because", NORM: "because"},
+    {ORTH: "'bout", LEMMA: "about", NORM: "about"},
+    {ORTH: "ma'am", LEMMA: "madam", NORM: "madam"},
+    {ORTH: "Ma'am", LEMMA: "madam", NORM: "madam"},
+    {ORTH: "o'clock", LEMMA: "o'clock", NORM: "o'clock"},
+    {ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"},

-    {ORTH: "Mt.", LEMMA: "Mount"},
-    {ORTH: "Ak.", LEMMA: "Alaska"},
-    {ORTH: "Ala.", LEMMA: "Alabama"},
-    {ORTH: "Apr.", LEMMA: "April"},
-    {ORTH: "Ariz.", LEMMA: "Arizona"},
-    {ORTH: "Ark.", LEMMA: "Arkansas"},
-    {ORTH: "Aug.", LEMMA: "August"},
-    {ORTH: "Calif.", LEMMA: "California"},
-    {ORTH: "Colo.", LEMMA: "Colorado"},
-    {ORTH: "Conn.", LEMMA: "Connecticut"},
-    {ORTH: "Dec.", LEMMA: "December"},
-    {ORTH: "Del.", LEMMA: "Delaware"},
-    {ORTH: "Feb.", LEMMA: "February"},
-    {ORTH: "Fla.", LEMMA: "Florida"},
-    {ORTH: "Ga.", LEMMA: "Georgia"},
-    {ORTH: "Ia.", LEMMA: "Iowa"},
-    {ORTH: "Id.", LEMMA: "Idaho"},
-    {ORTH: "Ill.", LEMMA: "Illinois"},
-    {ORTH: "Ind.", LEMMA: "Indiana"},
-    {ORTH: "Jan.", LEMMA: "January"},
-    {ORTH: "Jul.", LEMMA: "July"},
-    {ORTH: "Jun.", LEMMA: "June"},
-    {ORTH: "Kan.", LEMMA: "Kansas"},
-    {ORTH: "Kans.", LEMMA: "Kansas"},
-    {ORTH: "Ky.", LEMMA: "Kentucky"},
-    {ORTH: "La.", LEMMA: "Louisiana"},
-    {ORTH: "Mar.", LEMMA: "March"},
-    {ORTH: "Mass.", LEMMA: "Massachusetts"},
-    {ORTH: "May.", LEMMA: "May"},
-    {ORTH: "Mich.", LEMMA: "Michigan"},
-    {ORTH: "Minn.", LEMMA: "Minnesota"},
-    {ORTH: "Miss.", LEMMA: "Mississippi"},
-    {ORTH: "N.C.", LEMMA: "North Carolina"},
-    {ORTH: "N.D.", LEMMA: "North Dakota"},
-    {ORTH: "N.H.", LEMMA: "New Hampshire"},
-    {ORTH: "N.J.", LEMMA: "New Jersey"},
-    {ORTH: "N.M.", LEMMA: "New Mexico"},
-    {ORTH: "N.Y.", LEMMA: "New York"},
-    {ORTH: "Neb.", LEMMA: "Nebraska"},
-    {ORTH: "Nebr.", LEMMA: "Nebraska"},
-    {ORTH: "Nev.", LEMMA: "Nevada"},
-    {ORTH: "Nov.", LEMMA: "November"},
-    {ORTH: "Oct.", LEMMA: "October"},
-    {ORTH: "Okla.", LEMMA: "Oklahoma"},
-    {ORTH: "Ore.", LEMMA: "Oregon"},
-    {ORTH: "Pa.", LEMMA: "Pennsylvania"},
-    {ORTH: "S.C.", LEMMA: "South Carolina"},
-    {ORTH: "Sep.", LEMMA: "September"},
-    {ORTH: "Sept.", LEMMA: "September"},
-    {ORTH: "Tenn.", LEMMA: "Tennessee"},
-    {ORTH: "Va.", LEMMA: "Virginia"},
-    {ORTH: "Wash.", LEMMA: "Washington"},
-    {ORTH: "Wis.", LEMMA: "Wisconsin"}]:
+    {ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"},
+    {ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"},
+    {ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"},
+    {ORTH: "Apr.", LEMMA: "April", NORM: "April"},
+    {ORTH: "Ariz.", LEMMA: "Arizona", NORM: "Arizona"},
+    {ORTH: "Ark.", LEMMA: "Arkansas", NORM: "Arkansas"},
+    {ORTH: "Aug.", LEMMA: "August", NORM: "August"},
+    {ORTH: "Calif.", LEMMA: "California", NORM: "California"},
+    {ORTH: "Colo.", LEMMA: "Colorado", NORM: "Colorado"},
+    {ORTH: "Conn.", LEMMA: "Connecticut", NORM: "Connecticut"},
+    {ORTH: "Dec.", LEMMA: "December", NORM: "December"},
+    {ORTH: "Del.", LEMMA: "Delaware", NORM: "Delaware"},
+    {ORTH: "Feb.", LEMMA: "February", NORM: "February"},
+    {ORTH: "Fla.", LEMMA: "Florida", NORM: "Florida"},
+    {ORTH: "Ga.", LEMMA: "Georgia", NORM: "Georgia"},
+    {ORTH: "Ia.", LEMMA: "Iowa", NORM: "Iowa"},
+    {ORTH: "Id.", LEMMA: "Idaho", NORM: "Idaho"},
+    {ORTH: "Ill.", LEMMA: "Illinois", NORM: "Illinois"},
+    {ORTH: "Ind.", LEMMA: "Indiana", NORM: "Indiana"},
+    {ORTH: "Jan.", LEMMA: "January", NORM: "January"},
+    {ORTH: "Jul.", LEMMA: "July", NORM: "July"},
+    {ORTH: "Jun.", LEMMA: "June", NORM: "June"},
+    {ORTH: "Kan.", LEMMA: "Kansas", NORM: "Kansas"},
+    {ORTH: "Kans.", LEMMA: "Kansas", NORM: "Kansas"},
+    {ORTH: "Ky.", LEMMA: "Kentucky", NORM: "Kentucky"},
+    {ORTH: "La.", LEMMA: "Louisiana", NORM: "Louisiana"},
+    {ORTH: "Mar.", LEMMA: "March", NORM: "March"},
+    {ORTH: "Mass.", LEMMA: "Massachusetts", NORM: "Massachusetts"},
+    {ORTH: "May.", LEMMA: "May", NORM: "May"},
+    {ORTH: "Mich.", LEMMA: "Michigan", NORM: "Michigan"},
+    {ORTH: "Minn.", LEMMA: "Minnesota", NORM: "Minnesota"},
+    {ORTH: "Miss.", LEMMA: "Mississippi", NORM: "Mississippi"},
+    {ORTH: "N.C.", LEMMA: "North Carolina", NORM: "North Carolina"},
+    {ORTH: "N.D.", LEMMA: "North Dakota", NORM: "North Dakota"},
+    {ORTH: "N.H.", LEMMA: "New Hampshire", NORM: "New Hampshire"},
+    {ORTH: "N.J.", LEMMA: "New Jersey", NORM: "New Jersey"},
+    {ORTH: "N.M.", LEMMA: "New Mexico", NORM: "New Mexico"},
+    {ORTH: "N.Y.", LEMMA: "New York", NORM: "New York"},
+    {ORTH: "Neb.", LEMMA: "Nebraska", NORM: "Nebraska"},
+    {ORTH: "Nebr.", LEMMA: "Nebraska", NORM: "Nebraska"},
+    {ORTH: "Nev.", LEMMA: "Nevada", NORM: "Nevada"},
+    {ORTH: "Nov.", LEMMA: "November", NORM: "November"},
+    {ORTH: "Oct.", LEMMA: "October", NORM: "October"},
+    {ORTH: "Okla.", LEMMA: "Oklahoma", NORM: "Oklahoma"},
+    {ORTH: "Ore.", LEMMA: "Oregon", NORM: "Oregon"},
+    {ORTH: "Pa.", LEMMA: "Pennsylvania", NORM: "Pennsylvania"},
+    {ORTH: "S.C.", LEMMA: "South Carolina", NORM: "South Carolina"},
+    {ORTH: "Sep.", LEMMA: "September", NORM: "September"},
+    {ORTH: "Sept.", LEMMA: "September", NORM: "September"},
+    {ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"},
+    {ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
+    {ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
+    {ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]:
    _exc[exc_data[ORTH]] = [dict(exc_data)]


--- a/spacy/lang/norm_exceptions.py
+++ b/spacy/lang/norm_exceptions.py
@ -0,0 +1,46 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+# These exceptions are used to add NORM values based on a token's ORTH value.
+# Individual languages can also add their own exceptions and overwrite them -
+# for example, British vs. American spelling in English.
+
+# Norms are only set if no alternative is provided in the tokenizer exceptions.
+# Note that this does not change any other token attributes. Its main purpose
+# is to normalise the word representations so that equivalent tokens receive
+# similar representations. For example: $ and € are very different, but they're
+# both currency symbols. By normalising currency symbols to $, all symbols are
+# seen as similar, no matter how common they are in the training data.
+
+
+# Language-independent ORTH -> NORM lookup table.
+BASE_NORMS = {
+    # Possessive "'s": upper-case and curly-apostrophe spellings map to 's.
+    "'s": "'s",
+    "'S": "'s",
+    "’s": "'s",
+    "’S": "'s",
+    # Apostrophe-like characters map to the ASCII apostrophe.
+    "’": "'",
+    "‘": "'",
+    "´": "'",
+    "`": "'",
+    # Double-quote variants (curly, doubled, low, guillemets) map to ASCII ".
+    "”": '"',
+    "“": '"',
+    "''": '"',
+    "``": '"',
+    "´´": '"',
+    "„": '"',
+    "»": '"',
+    "«": '"',
+    # Ellipsis character maps to three periods.
+    "…": "...",
+    # Dash characters and repeated hyphens map to a single hyphen.
+    "—": "-",
+    "–": "-",
+    "--": "-",
+    "---": "-",
+    # Currency symbols and prefixed dollar signs all map to $.
+    "€": "$",
+    "£": "$",
+    "¥": "$",
+    "฿": "$",
+    "US$": "$",
+    "C$": "$",
+    "A$": "$"
+}
--- a/spacy/tests/serialize/test_serialize_tokenizer.py
+++ b/spacy/tests/serialize/test_serialize_tokenizer.py
@ -0,0 +1,33 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ...util import get_lang_class
+from ..util import make_tempdir, assert_packed_msg_equal
+
+import pytest
+
+
+def load_tokenizer(b):
+    """Create a new English tokenizer and restore its state from bytes.
+
+    b (bytes): Serialized tokenizer data, e.g. the output of `to_bytes()`.
+    RETURNS: The newly created tokenizer after `from_bytes(b)` was applied.
+    """
+    defaults = get_lang_class('en').Defaults
+    tokenizer = defaults.create_tokenizer()
+    tokenizer.from_bytes(b)
+    return tokenizer
+
+
+@pytest.mark.parametrize('text', ["I💜you", "they’re", "“hello”"])
+def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
+    """A tokenizer restored from bytes serializes and tokenizes the same."""
+    restored = load_tokenizer(en_tokenizer.to_bytes())
+    # The unpacked messages are compared instead of the raw byte strings.
+    assert_packed_msg_equal(restored.to_bytes(), en_tokenizer.to_bytes())
+    original_texts = [token.text for token in en_tokenizer(text)]
+    restored_texts = [token.text for token in restored(text)]
+    assert original_texts == restored_texts
+
+
+def test_serialize_tokenizer_roundtrip_disk(en_tokenizer):
+    """A tokenizer written with to_disk and reloaded has the same state."""
+    tokenizer = en_tokenizer
+    with make_tempdir() as d:
+        file_path = d / 'tokenizer'
+        tokenizer.to_disk(file_path)
+        # Load into a fresh tokenizer. Calling from_disk on en_tokenizer
+        # would load into the very object we are comparing against, so the
+        # assertion could never fail.
+        tokenizer_d = get_lang_class('en').Defaults.create_tokenizer()
+        tokenizer_d.from_disk(file_path)
+        # Compare unpacked messages, as the bytes round-trip test does.
+        assert_packed_msg_equal(tokenizer_d.to_bytes(), tokenizer.to_bytes())
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@ -10,6 +10,7 @@ import numpy
 import tempfile
 import shutil
 import contextlib
+import msgpack
 from pathlib import Path


@ -105,3 +106,13 @@ def assert_docs_equal(doc1, doc2):
    assert [ t.ent_type for t in doc1 ] == [ t.ent_type for t in doc2 ]
    assert [ t.ent_iob for t in doc1 ] == [ t.ent_iob for t in doc2 ]
    assert [ ent for ent in doc1.ents ] == [ ent for ent in doc2.ents ]
+
+
+def assert_packed_msg_equal(b1, b2):
+    """Assert that two packed msgpack messages unpack to equal mappings.
+
+    Both messages are unpacked and compared key by key, so the comparison
+    does not depend on the order in which the keys were packed.
+
+    b1 (bytes): First packed message.
+    b2 (bytes): Second packed message.
+    """
+    unpacked1 = msgpack.loads(b1, encoding='utf8')
+    unpacked2 = msgpack.loads(b2, encoding='utf8')
+    keys = sorted(unpacked1.keys())
+    assert keys == sorted(unpacked2.keys())
+    # The key lists are equal at this point, so only the values remain.
+    for key in keys:
+        assert unpacked1[key] == unpacked2[key]
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -2,6 +2,7 @@
 # coding: utf8
 from __future__ import unicode_literals

+from collections import OrderedDict
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
 from cymem.cymem cimport Pool
@ -355,14 +356,14 @@ cdef class Tokenizer:
        **exclude: Named attributes to prevent from being serialized.
        RETURNS (bytes): The serialized form of the `Tokenizer` object.
        """
-        serializers = {
-            'vocab': lambda: self.vocab.to_bytes(),
-            'prefix_search': lambda: self.prefix_search.__self__.pattern,
-            'suffix_search': lambda: self.suffix_search.__self__.pattern,
-            'infix_finditer': lambda: self.infix_finditer.__self__.pattern,
-            'token_match': lambda: self.token_match.__self__.pattern,
-            'exceptions': lambda: self._rules
-        }
+        # The fields are listed in a fixed order and the exception rules are
+        # sorted by key before being wrapped in an OrderedDict.
+        # NOTE(review): presumably this makes the serialized output
+        # reproducible across calls -- confirm against util.to_bytes.
+        serializers = OrderedDict((
+            ('vocab', lambda: self.vocab.to_bytes()),
+            ('prefix_search', lambda: self.prefix_search.__self__.pattern),
+            ('suffix_search', lambda: self.suffix_search.__self__.pattern),
+            ('infix_finditer', lambda: self.infix_finditer.__self__.pattern),
+            ('token_match', lambda: self.token_match.__self__.pattern),
+            ('exceptions', lambda: OrderedDict(sorted(self._rules.items())))
+        ))
        return util.to_bytes(serializers, exclude)

    def from_bytes(self, bytes_data, **exclude):
@ -372,15 +373,15 @@ cdef class Tokenizer:
        **exclude: Named attributes to prevent from being loaded.
        RETURNS (Tokenizer): The `Tokenizer` object.
        """
-        data = {}
-        deserializers = {
-            'vocab': lambda b: self.vocab.from_bytes(b),
-            'prefix_search': lambda b: data.setdefault('prefix', b),
-            'suffix_search': lambda b: data.setdefault('suffix_search', b),
-            'infix_finditer': lambda b: data.setdefault('infix_finditer', b),
-            'token_match': lambda b: data.setdefault('token_match', b),
-            'exceptions': lambda b: data.setdefault('rules', b)
-        }
+        # The deserializers only stash the raw values in `data`; the values
+        # are applied to the tokenizer further down, after util.from_bytes
+        # has run.
+        data = OrderedDict()
+        deserializers = OrderedDict((
+            ('vocab', lambda b: self.vocab.from_bytes(b)),
+            # Must be stored under 'prefix_search' (not 'prefix'): that is the
+            # key the `'prefix_search' in data` check below looks up.
+            # Otherwise a deserialized prefix pattern would never be applied.
+            ('prefix_search', lambda b: data.setdefault('prefix_search', b)),
+            ('suffix_search', lambda b: data.setdefault('suffix_search', b)),
+            ('infix_finditer', lambda b: data.setdefault('infix_finditer', b)),
+            ('token_match', lambda b: data.setdefault('token_match', b)),
+            # The exception rules are read back via data.get('rules') below.
+            ('exceptions', lambda b: data.setdefault('rules', b))
+        ))
        msg = util.from_bytes(bytes_data, deserializers, exclude)
        if 'prefix_search' in data:
            self.prefix_search = re.compile(data['prefix_search']).search
@ -392,3 +393,4 @@ cdef class Tokenizer:
            self.token_match = re.compile(data['token_match']).search
        for string, substrings in data.get('rules', {}).items():
            self.add_special_case(string, substrings)
+        return self
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -437,7 +437,8 @@ cdef class Doc:
        """
        def __get__(self):
            if 'sents' in self.user_hooks:
-                return self.user_hooks['sents'](self)
+                yield from self.user_hooks['sents'](self)
+                return

            if not self.is_parsed:
                raise ValueError(
@ -740,7 +741,7 @@ cdef class Doc:
        token.spacy = self.c[end-1].spacy
        for attr_name, attr_value in attributes.items():
            if attr_name == TAG:
-                self.vocab.morphology.assign_tag(token, attr_value) 
+                self.vocab.morphology.assign_tag(token, attr_value)
            else:
                Token.set_struct_attr(token, attr_name, attr_value)
        # Begin by setting all the head indices to absolute token positions
--- a/spacy/util.py
+++ b/spacy/util.py
@ -299,6 +299,22 @@ def compile_infix_regex(entries):
    return re.compile(expression)


+def add_lookups(default_func, *lookups):
+    """Wrap an attribute getter so that lookup tables take precedence over it.
+
+    The tables are consulted in the order given, and the first one that
+    contains the string supplies the value. If no table contains it,
+    `default_func` is called with the string instead.
+
+    default_func (callable): The default function to execute.
+    *lookups (dict): Lookup dictionary mapping string to attribute value.
+    RETURNS (callable): Lexical attribute getter.
+    """
+    # Sentinel, so that a stored value of None still counts as a hit.
+    not_found = object()
+
+    def get_attr(string):
+        hits = (table[string] for table in lookups if string in table)
+        value = next(hits, not_found)
+        if value is not_found:
+            return default_func(string)
+        return value
+    return get_attr
+
+
 def update_exc(base_exceptions, *addition_dicts):
    """Update and validate tokenizer exceptions. Will overwrite exceptions.

--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -231,11 +231,13 @@ cdef class Vocab:
            props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True)
            token = &tokens[i]
            # Set the special tokens up to have arbitrary attributes
-            token.lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
+            lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
+            token.lex = lex
            if attrs.TAG in props:
                self.morphology.assign_tag(token, props[attrs.TAG])
            for attr_id, value in props.items():
                Token.set_struct_attr(token, attr_id, value)
+                Lexeme.set_struct_attr(lex, attr_id, value)
        return tokens

    @property
--- a/website/docs/api/displacy.jade
+++ b/website/docs/api/displacy.jade
@ -205,7 +205,7 @@ p
        +cell #[code arrow_spacing]
        +cell int
        +cell Spacing between arrows in px to avoid overlaps.
-        +cell #[code 20]
+        +cell #[code 20] / #[code 12] (compact)

    +row
        +cell #[code word_spacing]
--- a/website/docs/api/doc.jade
+++ b/website/docs/api/doc.jade
@ -64,7 +64,7 @@ p
    doc = nlp(u'Give it back! He pleaded.')
    assert doc[0].text == 'Give'
    assert doc[-1].text == '.'
-    span = doc[1:1]
+    span = doc[1:3]
    assert span.text == 'it back'

 +table(["Name", "Type", "Description"])
--- a/website/docs/usage/customizing-tokenizer.jade
+++ b/website/docs/usage/customizing-tokenizer.jade
@ -141,7 +141,7 @@ p
                else:
                    tokens.append(substring)
                    substring = ''
-            tokens.extend(suffixes)
+            tokens.extend(reversed(suffixes))
            return tokens

 p
--- a/website/docs/usage/visualizers.jade
+++ b/website/docs/usage/visualizers.jade
@ -59,9 +59,11 @@ p
    |  to customise the layout, for example:

 +aside("Important note")
-    |  There's currently a known issue with the #[code compact] mode for long
-    |  sentences with arrow spacing. If the spacing is larger than the arc
-    |  itself, it'll cause the arc and its label to flip.
+    |  There's currently a known issue with the #[code compact] mode for
+    |  sentences with short arrows and long dependency labels: labels that are
+    |  longer than their arrow will wrap. If you come across this problem,
+    |  especially when using custom labels, you'll have to increase the
+    |  #[code distance] setting in the #[code options] to allow longer arcs.

 +table(["Name", "Type", "Description", "Default"])
    +row