From fa7e576c579198072266e43681207c26fcabc954 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 21:52:06 +0200 Subject: [PATCH 1/5] Change order of exception dicts --- spacy/lang/de/__init__.py | 2 +- spacy/lang/en/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index 0a161e80e..b8a7580a0 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -20,7 +20,7 @@ class GermanDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'de' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], - BASE_NORMS, NORM_EXCEPTIONS) + NORM_EXCEPTIONS, BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tag_map = dict(TAG_MAP) diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 3f422b834..a6c216b43 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -21,7 +21,7 @@ class EnglishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'en' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], - BASE_NORMS, NORM_EXCEPTIONS) + NORM_EXCEPTIONS, BASE_NORMS) lex_attr_getters.update(LEX_ATTRS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) From ec6d2bc81df0f3532ad558fdc2ac99b361ef4ac3 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 22:16:26 +0200 Subject: [PATCH 2/5] Add table of contents mixin --- website/_includes/_mixins.jade | 11 +++++++++++ website/docs/usage/spacy-101.jade | 29 +++++++++++++---------------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 9de43b092..16514bcda 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -383,3 +383,14 @@ mixin annotation-row(annots, style) else +cell=cell block + + +//- Table of contents, to be used with +item mixins for links + col - [string] width of column (see +grid-col) + +mixin table-of-contents(col) + +grid-col(col || "half") + +infobox + +label.o-block-small Table of contents + +list("numbers").u-text-small.o-no-block + block diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 55e7a030a..03897600d 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -38,22 +38,19 @@ p | #[strong natural language understanding] systems, or to | pre-process text for #[strong deep learning]. 
- +grid-col("half") - +infobox - +label.o-block-small Table of contents - +list("numbers").u-text-small.o-no-block - +item #[+a("#features") Features] - +item #[+a("#annotations") Linguistic annotations] - +item #[+a("#annotations-token") Tokenization] - +item #[+a("#annotations-pos-deps") POS tags and dependencies] - +item #[+a("#annotations-ner") Named entities] - +item #[+a("#vectors-similarity") Word vectos and similarity] - +item #[+a("#pipelines") Pipelines] - +item #[+a("#vocab") Vocab, hashes and lexemes] - +item #[+a("#serialization") Serialization] - +item #[+a("#training") Training] - +item #[+a("#architecture") Architecture] - +item #[+a("#community") Community & FAQ] + +table-of-contents + +item #[+a("#features") Features] + +item #[+a("#annotations") Linguistic annotations] + +item #[+a("#annotations-token") Tokenization] + +item #[+a("#annotations-pos-deps") POS tags and dependencies] + +item #[+a("#annotations-ner") Named entities] + +item #[+a("#vectors-similarity") Word vectos and similarity] + +item #[+a("#pipelines") Pipelines] + +item #[+a("#vocab") Vocab, hashes and lexemes] + +item #[+a("#serialization") Serialization] + +item #[+a("#training") Training] + +item #[+a("#architecture") Architecture] + +item #[+a("#community") Community & FAQ] +h(3, "what-spacy-isnt") What spaCy isn't From a3715a81d5a1b9a5309920dd987fd8c167dea689 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 22:16:38 +0200 Subject: [PATCH 3/5] Update adding languages guide --- website/docs/usage/adding-languages.jade | 142 ++++++++++++++++++----- 1 file changed, 115 insertions(+), 27 deletions(-) diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index 005c4e750..c900734d4 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -3,32 +3,51 @@ include ../../_includes/_mixins p - | Adding full support for a language touches many different parts of the - | spaCy library. This guide explains how to fit everything together, and - | points you to the specific workflows for each component. Obviously, - | there are lots of ways you can organise your code when you implement - | your own #[+api("language") #[code Language]] class. This guide will - | focus on how it's done within spaCy. For full language support, we'll - | need to: + | Adding full support for a language touches many different parts of the + | spaCy library. This guide explains how to fit everything together, and + | points you to the specific workflows for each component. -+list("numbers") - +item - | Create a #[strong #[code Language] subclass]. - +item - | Define custom #[strong language data], like a stop list and tokenizer - | exceptions. - +item - | #[strong Test] the new language tokenizer. - +item - | #[strong Build the vocabulary], including word frequencies, Brown - | clusters and word vectors. - +item - | Set up a #[strong model direcory] and #[strong train] the tagger and - | parser. ++grid.o-no-block + +grid-col("half") + p + | Obviously, there are lots of ways you can organise your code when + | you implement your own language data. This guide will focus on + | how it's done within spaCy. For full language support, you'll + | need to create a #[code Language] subclass, define custom + | #[strong language data], like a stop list and tokenizer + | exceptions and test the new tokenizer. Once the language is set + | up, you can #[strong build the vocabulary], including word + | frequencies, Brown clusters and word vectors. 
Finally, you can + | #[strong train the tagger and parser], and save the model to a + | directory. -p - | For some languages, you may also want to develop a solution for - | lemmatization and morphological analysis. + p + | For some languages, you may also want to develop a solution for + | lemmatization and morphological analysis. + + +table-of-contents + +item #[+a("#language-subclass") The Language subclass] + +item #[+a("#language-data") Adding language data] + +item #[+a("#stop-workds") Stop words] + +item #[+a("#tokenizer-exceptions") Tokenizer exceptions] + +item #[+a("#norm-exceptions") Norm exceptions] + +item #[+a("#lex-attrs") Lexical attributes] + +item #[+a("#lemmatizer") Lemmatizer] + +item #[+a("#tag-map") Tag map] + +item #[+a("#morph-rules") Morph rules] + +item #[+a("#testing") Testing the tokenizer] + +item #[+a("#vocabulary") Building the vocabulary] + +item #[+a("#training") Training] + ++aside("Working on spaCy's source") + | To add a new language to spaCy, you'll need to + | #[strong modify the library's code]. The easiest way to do this is to + | clone the #[+src(gh("spaCy")) repository] and #[strong build spaCy from source]. + | For more information on this, see the #[+a("/docs/usage") installation guide]. + | Unlike spaCy's core, which is mostly written in Cython, all language + | data is stored in regular Python files. This means that you won't have to + | rebuild anything in between – you can simply make edits and reload spaCy + | to test them. +h(2, "language-subclass") Creating a #[code Language] subclass @@ -123,6 +142,14 @@ p | Special-case rules for the tokenizer, for example, contractions | and abbreviations containing punctuation. + +row + +cell #[+src(gh("spaCy", "spacy/lang/norm_exceptions.py")) norm_exceptions.py] + +cell + | #[code NORM_EXCEPTIONS] (dict) + +cell + | Special-case rules for normalising tokens and assigning norms, + | for example American vs. British spelling. + +row +cell #[+src(gh("spaCy", "spacy/lang/punctuation.py")) punctuation.py] +cell @@ -235,7 +262,7 @@ p TOKENIZER_EXCEPTIONS = { "don't": [ {ORTH: "do", LEMMA: "do"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}] + {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}] } +infobox("Important note") @@ -286,7 +313,7 @@ p p | When adding the tokenizer exceptions to the #[code Defaults], you can use | the #[+api("util#update_exc") #[code update_exc()]] helper function to merge - | them with the global base exceptions (including one-letter abbreviations + | them with the global base exceptions (including one-letter abbreviations | and emoticons). The function performs a basic check to make sure | exceptions are provided in the correct format. It can take any number of | exceptions dicts as its arguments, and will update and overwrite the @@ -303,13 +330,74 @@ p tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) # {"a.": [{ORTH: "a.", LEMMA: "all"}], ":)": [{ORTH: ":)"}]} -//-+aside("About spaCy's custom pronoun lemma") ++infobox("About spaCy's custom pronoun lemma") | Unlike verbs and common nouns, there's no clear base form of a personal | pronoun. Should the lemma of "me" be "I", or should we normalize person | as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a | novel symbol, #[code.u-nowrap -PRON-], which is used as the lemma for | all personal pronouns. ++h(3, "norm-exceptions") Norm exceptions + +p + | In addition to #[code ORTH] or #[code LEMMA], tokenizer exceptions can + | also set a #[code NORM] attribute. 
This is useful to specify a normalised
+    | version of the token – for example, the norm of "n't" is "not". By default,
+    | a token's norm equals its lowercase text. If the lowercase spelling of a
+    | word exists, norms should always be in lowercase.
+
++aside-code("Accessing norms").
+    doc = nlp(u"I can't")
+    assert [t.norm_ for t in doc] == ['i', 'can', 'not']
+
+p
+    | spaCy usually tries to normalise words with different spellings to a single,
+    | common spelling. This has no effect on any other token attributes, or
+    | tokenization in general, but it ensures that
+    | #[strong equivalent tokens receive similar representations]. This can
+    | improve the model's predictions on words that weren't common in the
+    | training data, but are equivalent to other words – for example, "realize"
+    | and "realise", or "thx" and "thanks".
+
+p
+    | Similarly, spaCy also includes
+    | #[+src(gh("spaCy", "spacy/lang/norm_exceptions.py")) global base norms]
+    | for normalising different styles of quotation marks and currency
+    | symbols. Even though #[code $] and #[code €] are very different, spaCy
+    | normalises them both to #[code $]. This way, they'll always be seen as
+    | similar, no matter how common they were in the training data.
+
+p
+    | Norm exceptions can be provided as a simple dictionary. For more examples,
+    | see the English
+    | #[+src(gh("spaCy", "spacy/lang/en/norm_exceptions.py")) norm_exceptions.py].
+
++code("Example").
+    NORM_EXCEPTIONS = {
+        "cos": "because",
+        "fav": "favorite",
+        "accessorise": "accessorize",
+        "accessorised": "accessorized"
+    }
+
+p
+    | To add the custom norm exceptions lookup table, you can use the
+    | #[code add_lookups()] helper function. It takes the default attribute
+    | getter function as its first argument, plus a variable list of
+    | dictionaries. If a string's norm is found in one of the dictionaries,
+    | that value is used – otherwise, the default function is called and the
+    | token is assigned its default norm.
+
++code.
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
+                                         NORM_EXCEPTIONS, BASE_NORMS)
+
+p
+    | The order of the dictionaries is also the lookup order – so if your
+    | language's norm exceptions overwrite any of the global exceptions, they
+    | should be added first. Also note that the tokenizer exceptions will
+    | always have priority over the attribute getters.
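+
+p
+    | To see how the chain behaves, here is a minimal, self-contained sketch
+    | of a lookup helper with the semantics described above – not spaCy's
+    | actual implementation, but the same idea: check each table in order and
+    | fall back to the default getter if none of them contains the string.
+
++code("Lookup chain – illustrative sketch").
+    def chain_lookups(default_func, *lookups):
+        def get_attr(string):
+            for lookup in lookups:
+                if string in lookup:          # first table containing the string wins
+                    return lookup[string]
+            return default_func(string)       # otherwise use the default getter
+        return get_attr
+
+    NORM_EXCEPTIONS = {"accessorised": "accessorized"}   # language-specific norms
+    BASE_NORMS = {"€": "$"}                               # stand-in for the global base norms
+    get_norm = chain_lookups(lambda string: string.lower(),
+                             NORM_EXCEPTIONS, BASE_NORMS)
+
+    assert get_norm("accessorised") == "accessorized"     # from NORM_EXCEPTIONS
+    assert get_norm("€") == "$"                           # from BASE_NORMS
+    assert get_norm("Hello") == "hello"                   # default: lowercase text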
+ +h(3, "lex-attrs") Lexical attributes p From 4c643d74c5a1a873e0a345f158f587b8f322f85c Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 22:29:21 +0200 Subject: [PATCH 4/5] Add norm exceptions to other Language classes --- spacy/lang/da/__init__.py | 6 ++++-- spacy/lang/es/__init__.py | 6 ++++-- spacy/lang/fi/__init__.py | 6 ++++-- spacy/lang/fr/__init__.py | 6 ++++-- spacy/lang/hu/__init__.py | 6 ++++-- spacy/lang/it/__init__.py | 6 ++++-- spacy/lang/nb/__init__.py | 6 ++++-- spacy/lang/nl/__init__.py | 6 ++++-- spacy/lang/pl/__init__.py | 6 ++++-- spacy/lang/pt/__init__.py | 6 ++++-- spacy/lang/sv/__init__.py | 6 ++++-- spacy/lang/xx/__init__.py | 6 ++++-- 12 files changed, 48 insertions(+), 24 deletions(-) diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index b9e90dc0d..99babdc2c 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -5,14 +5,16 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class DanishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'da' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index 8291b2dd0..e20338b39 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -7,15 +7,17 @@ from .stop_words import STOP_WORDS from .lemmatizer import LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language from ...lemmatizerlookup import Lemmatizer -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class SpanishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'es' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tag_map = dict(TAG_MAP) diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py index 7010acd48..931ad5341 100644 --- a/spacy/lang/fi/__init__.py +++ b/spacy/lang/fi/__init__.py @@ -5,14 +5,16 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class FinnishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'fi' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index f9a01f223..e8c13777f 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -7,15 +7,17 @@ 
from .stop_words import STOP_WORDS from .lemmatizer import LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language from ...lemmatizerlookup import Lemmatizer -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class FrenchDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'fr' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py index 70b4ae5cc..0fe6a9f5c 100644 --- a/spacy/lang/hu/__init__.py +++ b/spacy/lang/hu/__init__.py @@ -7,15 +7,17 @@ from .stop_words import STOP_WORDS from .lemmatizer import LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language from ...lemmatizerlookup import Lemmatizer -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class HungarianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'hu' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 573a8df16..7cc717cb3 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -5,15 +5,17 @@ from .stop_words import STOP_WORDS from .lemmatizer import LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language from ...lemmatizerlookup import Lemmatizer -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class ItalianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'it' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index cb2baf148..c1b4af263 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -6,14 +6,16 @@ from .stop_words import STOP_WORDS from .morph_rules import MORPH_RULES from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class NorwegianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'nb' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index d6430d0b3..7b948f295 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -4,14 +4,16 @@ from __future__ import 
unicode_literals from .stop_words import STOP_WORDS from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class DutchDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'nl' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 535120874..067646dbd 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -4,14 +4,16 @@ from __future__ import unicode_literals from .stop_words import STOP_WORDS from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class PolishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'pl' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py index df6b76c7a..67539034d 100644 --- a/spacy/lang/pt/__init__.py +++ b/spacy/lang/pt/__init__.py @@ -7,15 +7,17 @@ from .lex_attrs import LEX_ATTRS from .lemmatizer import LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language from ...lemmatizerlookup import Lemmatizer -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class PortugueseDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'pt' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters.update(LEX_ATTRS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index b309643f7..2d3a640c5 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -7,15 +7,17 @@ from .morph_rules import MORPH_RULES from .lemmatizer import LEMMA_RULES, LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language from ...lemmatizerlookup import Lemmatizer -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class SwedishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'sv' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/xx/__init__.py index fef8c9d59..dc63ee33f 100644 --- a/spacy/lang/xx/__init__.py +++ b/spacy/lang/xx/__init__.py @@ -3,14 +3,16 @@ from __future__ import unicode_literals from ..tokenizer_exceptions import BASE_EXCEPTIONS 
+from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class MultiLanguageDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'xx' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) From 8a17b99b1c1107a632729fccf8c558faf2f764b6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 3 Jun 2017 15:30:16 -0500 Subject: [PATCH 5/5] Use NORM attribute, not LOWER --- spacy/_ml.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index c499a5cff..6d02dfd27 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -13,7 +13,7 @@ from thinc import describe from thinc.describe import Dimension, Synapses, Biases, Gradient from thinc.neural._classes.affine import _set_dimensions_if_needed -from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP +from .attrs import ID, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP from .tokens.doc import Doc import numpy @@ -131,14 +131,14 @@ class PrecomputableMaxouts(Model): return Yfp, backward def Tok2Vec(width, embed_size, preprocess=None): - cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE] + cols = [ID, NORM, PREFIX, SUFFIX, SHAPE] with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}): - lower = get_col(cols.index(LOWER)) >> HashEmbed(width, embed_size, name='embed_lower') + norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower') prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix') suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix') shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape') - embed = (lower | prefix | suffix | shape ) + embed = (norm | prefix | suffix | shape ) tok2vec = ( with_flatten( asarray(Model.ops, dtype='uint64') @@ -148,7 +148,7 @@ def Tok2Vec(width, embed_size, preprocess=None): >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)), - pad=4, ndim=5) + pad=4) ) if preprocess not in (False, None): tok2vec = preprocess >> tok2vec @@ -243,7 +243,7 @@ def zero_init(model): def doc2feats(cols=None): - cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE] + cols = [ID, NORM, PREFIX, SUFFIX, SHAPE] def forward(docs, drop=0.): feats = [] for doc in docs:
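
The practical effect of this last patch: because the feature columns are now keyed
on NORM rather than LOWER, tokens that share a norm exception map to the same
embedding rows. A quick, illustrative check – this assumes spaCy built from source
with the language data added above, and is not part of the patch itself:

    from spacy.lang.en import English
    from spacy.attrs import NORM

    nlp = English()                        # tokenizer and lexical attributes only
    doc = nlp(u"I don't think they realise it")
    print([t.norm_ for t in doc])          # expect "n't" -> "not", "realise" -> "realize"
    print(doc.to_array([NORM]))            # the IDs the embedding tables will see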