diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 06e0d5f72..2f699ecd2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -33,6 +33,7 @@ We use the following system to tag our issues: | [`install`](https://github.com/explosion/spaCy/labels/install) | Installation problems | | [`performance`](https://github.com/explosion/spaCy/labels/performance) | Accuracy, speed and memory use problems | | [`tests`](https://github.com/explosion/spaCy/labels/tests) | Missing or incorrect [tests](spacy/tests) | +| [`english`](https://github.com/explosion/spaCy/labels/english), [`german`](https://github.com/explosion/spaCy/labels/german) | Issues related to the specific languages, models and data | | [`linux`](https://github.com/explosion/spaCy/labels/linux), [`osx`](https://github.com/explosion/spaCy/labels/osx), [`windows`](https://github.com/explosion/spaCy/labels/windows) | Issues related to the specific operating systems | | [`pip`](https://github.com/explosion/spaCy/labels/pip), [`conda`](https://github.com/explosion/spaCy/labels/conda) | Issues related to the specific package managers | | [`duplicate`](https://github.com/explosion/spaCy/labels/duplicate) | Duplicates, i.e. issues that have been reported before | diff --git a/README.rst b/README.rst index c48c3479b..6e03aa0c4 100644 --- a/README.rst +++ b/README.rst @@ -3,8 +3,10 @@ spaCy: Industrial-strength NLP spaCy is a library for advanced natural language processing in Python and Cython. spaCy is built on the very latest research, but it isn't researchware. -It was designed from day 1 to be used in real products. It's commercial -open-source software, released under the MIT license. +It was designed from day one to be used in real products. spaCy currently supports +English and German, as well as tokenization for Chinese, Spanish, Italian, French, +Portuguese, Dutch, Swedish and Hungarian. It's commercial open-source software, +released under the MIT license. 💫 **Version 1.5 out now!** `Read the release notes here. `_ @@ -24,7 +26,7 @@ open-source software, released under the MIT license. :target: https://pypi.python.org/pypi/spacy :alt: pypi Version -.. image:: https://badges.gitter.im/spaCy-users.png +.. 
image:: https://badges.gitter.im/explosion.png :target: https://gitter.im/explosion/spaCy :alt: spaCy on Gitter diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py index 62256cc14..565eab37f 100644 --- a/bin/parser/train_ud.py +++ b/bin/parser/train_ud.py @@ -71,6 +71,8 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc): features = get_templates('basic') model_dir = pathlib.Path(model_dir) + if not (model_dir / 'deps').exists(): + (model_dir / 'deps').mkdir() with (model_dir / 'deps' / 'config.json').open('w') as file_: json.dump({'pseudoprojective': True, 'labels': actions, 'features': features}, file_) diff --git a/setup.py b/setup.py index 2a1d56a5e..4ba997a0c 100644 --- a/setup.py +++ b/setup.py @@ -47,8 +47,7 @@ PACKAGES = [ 'spacy.tests.tokenizer', 'spacy.tests.tokens', 'spacy.tests.vectors', - 'spacy.tests.vocab', - 'spacy.tests.website'] + 'spacy.tests.vocab'] MOD_NAMES = [ diff --git a/spacy/de/language_data.py b/spacy/de/language_data.py index f64c915f6..5e09c0eb3 100644 --- a/spacy/de/language_data.py +++ b/spacy/de/language_data.py @@ -9,12 +9,13 @@ from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY -TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) TAG_MAP = dict(TAG_MAP) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) diff --git a/spacy/de/tokenizer_exceptions.py b/spacy/de/tokenizer_exceptions.py index b0561a223..0d8dc54e8 100644 --- a/spacy/de/tokenizer_exceptions.py +++ b/spacy/de/tokenizer_exceptions.py @@ -516,11 +516,6 @@ TOKENIZER_EXCEPTIONS = { ORTH_ONLY = [ - "'", - "\\\")", - "", - "a.", - "ä.", "A.C.", "a.D.", "A.D.", @@ -530,24 +525,20 @@ ORTH_ONLY = [ "Abs.", "adv.", "al.", - "b.", "B.A.", "B.Sc.", "betr.", "biol.", "Biol.", - "c.", "ca.", "Chr.", "Cie.", "co.", "Co.", - "d.", "D.C.", "Dipl.-Ing.", "Dipl.", "Dr.", - "e.", "e.g.", "e.V.", "ehem.", @@ -555,79 +546,57 @@ ORTH_ONLY = [ "erm.", "etc.", "ev.", - "f.", - "g.", "G.m.b.H.", "geb.", "Gebr.", "gem.", - "h.", "h.c.", "Hg.", "hrsg.", "Hrsg.", - "i.", "i.A.", "i.e.", "i.G.", "i.Tr.", "i.V.", "Ing.", - "j.", "jr.", "Jr.", "jun.", "jur.", - "k.", "K.O.", - "l.", "L.A.", "lat.", - "m.", "M.A.", "m.E.", "m.M.", "M.Sc.", "Mr.", - "n.", "N.Y.", "N.Y.C.", "nat.", "ö." - "o.", "o.a.", "o.ä.", "o.g.", "o.k.", "O.K.", - "p.", "p.a.", "p.s.", "P.S.", "pers.", "phil.", - "q.", "q.e.d.", - "r.", "R.I.P.", "rer.", - "s.", "sen.", "St.", "std.", - "t.", - "u.", - "ü.", "u.a.", "U.S.", "U.S.A.", "U.S.S.", - "v.", "Vol.", "vs.", - "w.", - "wiss.", - "x.", - "y.", - "z." + "wiss." 
] diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py index a75f2b9d5..1fcbf277e 100644 --- a/spacy/en/language_data.py +++ b/spacy/en/language_data.py @@ -37,14 +37,16 @@ def get_time_exc(hours): return exc -TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) TAG_MAP = dict(TAG_MAP) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1))) update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’")) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + __all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "LEMMA_RULES", "MORPH_RULES"] diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 398ae486b..38fc33cfb 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -5,935 +5,355 @@ from ..symbols import * from ..language_data import PRON_LEMMA -TOKENIZER_EXCEPTIONS = { - "and/or": [ - {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"} - ], +EXC = {} - "Theydve": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], +EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Well", "well", "Whore", "whore"] - "shouldn't've": [ - {ORTH: "should"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - "There'll": [ - {ORTH: "There"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], +# Pronouns - "howll": [ - {ORTH: "how"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], +for pron in ["i"]: + for orth in [pron, pron.title()]: + EXC[orth + "'m"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1} + ] - "Hadn't've": [ - {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], + EXC[orth + "m"] = [ + {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 } + ] - "who'll": [ - {ORTH: "who"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], + EXC[orth + "'ma"] = [ + {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'m", LEMMA: "be", NORM: "am"}, + {ORTH: "a", LEMMA: "going to", NORM: "gonna"} + ] - "aint": [ - {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], + EXC[orth + "ma"] = [ + {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "m", LEMMA: "be", NORM: "am"}, + {ORTH: "a", LEMMA: "going to", NORM: "gonna"} + ] + +for pron in ["i", "you", "he", "she", "it", "we", "they"]: + for orth in [pron, pron.title()]: + EXC[orth + "'ll"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ] + + EXC[orth + "ll"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ] + + EXC[orth + "'ll've"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "llve"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "'d"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ] + + EXC[orth + "d"] = [ + {ORTH: orth, 
LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ] + + EXC[orth + "'d've"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "dve"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + +for pron in ["i", "you", "we", "they"]: + for orth in [pron, pron.title()]: + EXC[orth + "'ve"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "ve"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + +for pron in ["you", "we", "they"]: + for orth in [pron, pron.title()]: + EXC[orth + "'re"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ] + + EXC[orth + "re"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "re", LEMMA: "be", NORM: "are"} + ] + + +for pron in ["he", "she", "it"]: + for orth in [pron, pron.title()]: + EXC[orth + "'s"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'s"} + ] + + EXC[orth + "s"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "s"} + ] + + + +# W-words, relative pronouns, prepositions etc. + +for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: + for orth in [word, word.title()]: + EXC[orth + "'s"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "'s"} + ] + + EXC[orth + "s"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "s"} + ] + + EXC[orth + "'ll"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ] + + EXC[orth + "ll"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ] + + EXC[orth + "'ll've"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "llve"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "'re"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ] + + EXC[orth + "re"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "re", LEMMA: "be", NORM: "are"} + ] + + EXC[orth + "'ve"] = [ + {ORTH: orth}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "ve"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "'d"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "'d"} + ] + + EXC[orth + "d"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "d"} + ] + + EXC[orth + "'d've"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "dve"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + +# Verbs + +for verb_data in [ + {ORTH: "ca", LEMMA: "can", TAG: "MD"}, + {ORTH: "could", TAG: "MD"}, + {ORTH: "do", LEMMA: "do"}, + {ORTH: "does", LEMMA: "do"}, + {ORTH: "did", LEMMA: "do", TAG: "VBD"}, + {ORTH: "had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "may"}, + {ORTH: "might"}, + {ORTH: "must"}, + {ORTH: "need"}, + {ORTH: "ought"}, + {ORTH: "sha", LEMMA: "shall"}, + {ORTH: "should"}, + {ORTH: "wo", LEMMA: "will"}, + {ORTH: "would"} +]: + verb_data_tc = dict(verb_data) + verb_data_tc[ORTH] = verb_data_tc[ORTH].title() + + for data in [verb_data, verb_data_tc]: + EXC[data[ORTH] + "n't"] = [ + dict(data), + {ORTH: "n't", LEMMA: "not", TAG: 
"RB"} + ] + + EXC[data[ORTH] + "nt"] = [ + dict(data), + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ] + + EXC[data[ORTH] + "n't've"] = [ + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[data[ORTH] + "ntve"] = [ + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + +for verb_data in [ + {ORTH: "could", TAG: "MD"}, + {ORTH: "might"}, + {ORTH: "must"}, + {ORTH: "should"} +]: + verb_data_tc = dict(verb_data) + verb_data_tc[ORTH] = verb_data_tc[ORTH].title() + + for data in [verb_data, verb_data_tc]: + EXC[data[ORTH] + "'ve"] = [ + dict(data), + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[data[ORTH] + "ve"] = [ + dict(data), + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + +for verb_data in [ + {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2}, + {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, + {ORTH: "was", LEMMA: "be"}, + {ORTH: "were", LEMMA: "be"} +]: + verb_data_tc = dict(verb_data) + verb_data_tc[ORTH] = verb_data_tc[ORTH].title() + + for data in [verb_data, verb_data_tc]: + EXC[data[ORTH] + "n't"] = [ + dict(data), + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ] + + EXC[data[ORTH] + "nt"] = [ + dict(data), + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ] + + + +# Other contractions with trailing apostrophe + +for exc_data in [ + {ORTH: "doin", LEMMA: "do", NORM: "doing"}, + {ORTH: "goin", LEMMA: "go", NORM: "going"}, + {ORTH: "nothin", LEMMA: "nothing"}, + {ORTH: "nuthin", LEMMA: "nothing"}, + {ORTH: "ol", LEMMA: "old"}, + {ORTH: "somethin", LEMMA: "something"} +]: + exc_data_tc = dict(exc_data) + exc_data_tc[ORTH] = exc_data_tc[ORTH].title() + + for data in [exc_data, exc_data_tc]: + data_apos = dict(data) + data_apos[ORTH] = data_apos[ORTH] + "'" + + EXC[data[ORTH]] = [ + dict(data) + ] + + EXC[data_apos[ORTH]] = [ + dict(data_apos) + ] + + +# Other contractions with leading apostrophe + +for exc_data in [ + {ORTH: "cause", LEMMA: "because"}, + {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"}, + {ORTH: "ll", LEMMA: "will"}, + {ORTH: "nuff", LEMMA: "enough"} +]: + exc_data_apos = dict(exc_data) + exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH] + + for data in [exc_data, exc_data_apos]: + EXC[data[ORTH]] = [ + dict(data) + ] + + +# Rest + +OTHER = { " ": [ - {TAG: "SP", ORTH: " "} - ], - - "Shouldnt": [ - {ORTH: "Should"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "when's": [ - {ORTH: "when"}, - {ORTH: "'s", LEMMA: "be"} - ], - - "Didnt": [ - {ORTH: "Did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "itll": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "Who're": [ - {ORTH: "Who"}, - {ORTH: "'re", LEMMA: "be"} - ], - - "Ain't": [ - {ORTH: "Ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Can't": [ - {ORTH: "Ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Whyre": [ - {ORTH: "Why"}, - {ORTH: "re"} - ], - - "Aren't": [ - {ORTH: "Are", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Neednt": [ - {ORTH: "Need"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "should've": [ - {ORTH: "should"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "shouldn't": [ - {ORTH: "should"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Idve": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: 
"VB"} - ], - - "weve": [ - {ORTH: "we"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Ive": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "they'd": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Youdve": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "theyve": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Weren't": [ - {ORTH: "Were"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "werent": [ - {ORTH: "were"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "whyre": [ - {ORTH: "why"}, - {ORTH: "re"} - ], - - "I'm": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} - ], - - "She'd've": [ - {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "not've": [ - {ORTH: "not", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "we'll": [ - {ORTH: "we"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Don't": [ - {ORTH: "Do", LEMMA: "do"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Whyll": [ - {ORTH: "Why"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "they've": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "wasn't": [ - {ORTH: "was"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "could've": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "what've": [ - {ORTH: "what"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "havent": [ - {ORTH: "have", TAG: "VB"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Who've": [ - {ORTH: "Who"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Shan't": [ - {ORTH: "Sha", LEMMA: "shall"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "i'll": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "you'd": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "whens": [ - {ORTH: "when"}, - {ORTH: "s", LEMMA: "be"} - ], - - "whys": [ - {ORTH: "why"}, - {ORTH: "s"} - ], - - "Whereve": [ - {ORTH: "Where"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} + {ORTH: " ", TAG: "SP"} ], "\u00a0": [ {ORTH: "\u00a0", TAG: "SP", LEMMA: " "} ], - "there'd": [ - {ORTH: "there"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "hadn't've": [ - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "whatll": [ - {ORTH: "what"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "wouldn't've": [ - {ORTH: "would"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "there's": [ - {ORTH: "there"}, - {ORTH: "'s"} - ], - - "Who'll": [ - {ORTH: "Who"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "youll": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "wouldve": [ - {ORTH: "would"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Wouldnt": [ - {ORTH: "Would"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Thered": [ - {ORTH: "There"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Youre": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re", 
LEMMA: "be"} - ], - - "Couldn't've": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "who're": [ - {ORTH: "who"}, - {ORTH: "'re", LEMMA: "be"} - ], - - "Whys": [ - {ORTH: "Why"}, - {ORTH: "s"} - ], - - "mightn't've": [ - {ORTH: "might"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Wholl": [ - {ORTH: "Who"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "hadn't": [ - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Havent": [ - {ORTH: "Have", TAG: "VB"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Whatve": [ - {ORTH: "What"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Thats": [ - {ORTH: "That"}, - {ORTH: "s"} - ], - - "Howll": [ - {ORTH: "How"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "wouldn't": [ - {ORTH: "would"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "You'll": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Cant": [ - {ORTH: "Ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "i'd": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "weren't": [ - {ORTH: "were"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "would've": [ - {ORTH: "would"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "i'm": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} - ], - - "why'll": [ - {ORTH: "why"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "we'd've": [ - {ORTH: "we"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Shouldve": [ - {ORTH: "Should"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "can't": [ - {ORTH: "ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "thats": [ - {ORTH: "that"}, - {ORTH: "s"} - ], - - "Hes": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "s"} - ], - - "Needn't": [ - {ORTH: "Need"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "It's": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "Why're": [ - {ORTH: "Why"}, - {ORTH: "'re", LEMMA: "be"} - ], - - "Hed": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Mt.": [ - {ORTH: "Mt.", LEMMA: "Mount"} - ], - - "couldn't": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "What've": [ - {ORTH: "What"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "It'd": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "theydve": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "aren't": [ - {ORTH: "are", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Mightn't": [ - {ORTH: "Might"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - "'S": [ {ORTH: "'S", LEMMA: "'s"} ], - "I've": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + "'s": [ + {ORTH: "'s", LEMMA: "'s"} ], - "Whered": [ - {ORTH: "Where"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Itdve": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "I'ma": [ - 
{ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ma"} - ], - - "whos": [ - {ORTH: "who"}, - {ORTH: "s"} - ], - - "They'd": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "What'll": [ - {ORTH: "What"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "You've": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Mustve": [ - {ORTH: "Must"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "whod": [ - {ORTH: "who"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "mightntve": [ - {ORTH: "might"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "I'd've": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Must've": [ - {ORTH: "Must"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "it'd": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "what're": [ - {ORTH: "what"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "Wasn't": [ - {ORTH: "Was"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "what's": [ - {ORTH: "what"}, - {ORTH: "'s"} - ], - - "he'd've": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "She'd": [ - {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "shedve": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "ain't": [ - {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "She's": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "i'd've": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "We'd've": [ - {ORTH: "We"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "must've": [ - {ORTH: "must"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "That's": [ - {ORTH: "That"}, - {ORTH: "'s"} - ], - - "whatre": [ - {ORTH: "what"}, - {ORTH: "re"} - ], - - "you'd've": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Dont": [ - {ORTH: "Do", LEMMA: "do"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "thered": [ - {ORTH: "there"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Youd": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "couldn't've": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Whens": [ - {ORTH: "When"}, - {ORTH: "s"} - ], - - "Isnt": [ - {ORTH: "Is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "mightve": [ - {ORTH: "might"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "didnt": [ - {ORTH: "did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "ive": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "It'd've": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "\t": [ - {ORTH: "\t", TAG: "SP"} - ], - - 
"Itll": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "didn't": [ - {ORTH: "did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "cant": [ - {ORTH: "ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "im": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} - ], - - "they'd've": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Hadntve": [ - {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Weve": [ - {ORTH: "We"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Mightnt": [ - {ORTH: "Might"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "youdve": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Shedve": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "theyd": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Cannot": [ - {ORTH: "Can", LEMMA: "can", TAG: "MD"}, - {ORTH: "not", LEMMA: "not", TAG: "RB"} - ], - - "Hadn't": [ - {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "What're": [ - {ORTH: "What"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "He'll": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "wholl": [ - {ORTH: "who"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "They're": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "shouldnt": [ - {ORTH: "should"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "\n": [ - {ORTH: "\n", TAG: "SP"} - ], - - "whered": [ - {ORTH: "where"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "youve": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "notve": [ - {ORTH: "not", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "couldve": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "mustve": [ - {ORTH: "must"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Youve": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "therell": [ - {ORTH: "there"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "might've": [ - {ORTH: "might"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Mustn't": [ - {ORTH: "Must"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "wheres": [ - {ORTH: "where"}, - {ORTH: "s"} - ], - - "they're": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "idve": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "hows": [ - {ORTH: "how"}, - {ORTH: "s"} - ], - - "youre": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re", LEMMA: "be", NORM: "are"} - ], - - "Didn't": [ - {ORTH: "Did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Couldve": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "cannot": [ - {ORTH: "can", 
LEMMA: "can", TAG: "MD"}, - {ORTH: "not", LEMMA: "not", TAG: "RB"} - ], - - "Im": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be", NORM: "am"} - ], - - "howd": [ - {ORTH: "how"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "you've": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "You're": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "she'll": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Theyll": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "don't": [ - {ORTH: "do", LEMMA: "do"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "itd": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Hedve": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "isnt": [ - {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "won't": [ - {ORTH: "wo", LEMMA: "will"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "We're": [ - {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"}, + "'re": [ {ORTH: "'re", LEMMA: "be", NORM: "are"} ], @@ -945,391 +365,80 @@ TOKENIZER_EXCEPTIONS = { {ORTH: "\u2018s", LEMMA: "'s"} ], - "dont": [ - {ORTH: "do", LEMMA: "do"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} + "and/or": [ + {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"} ], - "ima": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ma"} + "'Cause": [ + {ORTH: "'Cause", LEMMA: "because"} ], - "Let's": [ - {ORTH: "Let"}, - {ORTH: "'s", LEMMA: "us"} + "y'all": [ + {ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"}, + {ORTH: "all"} ], - "he's": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} + "yall": [ + {ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"}, + {ORTH: "all"} ], - "we've": [ - {ORTH: "we"}, + "ma'am": [ + {ORTH: "ma'am", LEMMA: "madam"} + ], + + "Ma'am": [ + {ORTH: "Ma'am", LEMMA: "madam"} + ], + + "o'clock": [ + {ORTH: "o'clock", LEMMA: "o'clock"} + ], + + "O'clock": [ + {ORTH: "O'clock", LEMMA: "o'clock"} + ], + + "how'd'y": [ + {ORTH: "how", LEMMA: "how"}, + {ORTH: "'d", LEMMA: "do"}, + {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"} + ], + + "How'd'y": [ + {ORTH: "How", LEMMA: "how"}, + {ORTH: "'d", LEMMA: "do"}, + {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"} + ], + + "not've": [ + {ORTH: "not", LEMMA: "not", TAG: "RB"}, {ORTH: "'ve", LEMMA: "have", TAG: "VB"} ], - "What's": [ - {ORTH: "What"}, - {ORTH: "'s"} - ], - - "Who's": [ - {ORTH: "Who"}, - {ORTH: "'s"} - ], - - "hedve": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, + "notve": [ + {ORTH: "not", LEMMA: "not", TAG: "RB"}, {ORTH: "ve", LEMMA: "have", TAG: "VB"} ], - "he'd": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "When's": [ - {ORTH: "When"}, - {ORTH: "'s"} - ], - - "Mightn't've": [ - {ORTH: "Might"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "We've": [ - {ORTH: "We"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Couldntve": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Who'd": [ - {ORTH: "Who"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "haven't": [ - 
{ORTH: "have", TAG: "VB"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "arent": [ - {ORTH: "are", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "You'd've": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Wouldn't": [ - {ORTH: "Would"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "who's": [ - {ORTH: "who"}, - {ORTH: "'s"} - ], - - "Mightve": [ - {ORTH: "Might"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Theredve": [ - {ORTH: "There"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "theredve": [ - {ORTH: "there"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "who'd": [ - {ORTH: "who"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Where's": [ - {ORTH: "Where"}, - {ORTH: "'s"} - ], - - "wont": [ - {ORTH: "wo", LEMMA: "will"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "she'd've": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Should've": [ - {ORTH: "Should"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "theyre": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re"} - ], - - "Wouldntve": [ - {ORTH: "Would"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Where've": [ - {ORTH: "Where"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "mustn't": [ - {ORTH: "must"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "isn't": [ - {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Aint": [ - {ORTH: "Ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "why's": [ - {ORTH: "why"}, - {ORTH: "'s"} - ], - - "There'd": [ - {ORTH: "There"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "They'll": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "how'll": [ - {ORTH: "how"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Wedve": [ - {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "couldntve": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "There's": [ - {ORTH: "There"}, - {ORTH: "'s"} - ], - - "we'd": [ - {ORTH: "we", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Whod": [ - {ORTH: "Who"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "whatve": [ - {ORTH: "what"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Wouldve": [ - {ORTH: "Would"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "there'll": [ - {ORTH: "there"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "needn't": [ - {ORTH: "need"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "shouldntve": [ - {ORTH: "should"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "why're": [ - {ORTH: "why"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "Doesnt": [ - {ORTH: "Does", LEMMA: "do", TAG: "VBZ"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "whereve": [ - {ORTH: "where"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "they'll": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - 
], - - "I'd": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Might've": [ - {ORTH: "Might"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "mightnt": [ - {ORTH: "might"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - "Not've": [ {ORTH: "Not", LEMMA: "not", TAG: "RB"}, {ORTH: "'ve", LEMMA: "have", TAG: "VB"} ], - "mightn't": [ - {ORTH: "might"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} + "Notve": [ + {ORTH: "Not", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} ], - "you're": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} + "cannot": [ + {ORTH: "can", LEMMA: "can", TAG: "MD"}, + {ORTH: "not", LEMMA: "not", TAG: "RB"} ], - "They've": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "what'll": [ - {ORTH: "what"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Could've": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Would've": [ - {ORTH: "Would"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Isn't": [ - {ORTH: "Is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "let's": [ - {ORTH: "let"}, - {ORTH: "'s", LEMMA: "us"} - ], - - "She'll": [ - {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "You'd": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "wouldnt": [ - {ORTH: "would"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Why'll": [ - {ORTH: "Why"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Where'd": [ - {ORTH: "Where"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Theyre": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re", LEMMA: "be", NORM: "are"} - ], - - "Won't": [ - {ORTH: "Wo", LEMMA: "will"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Couldn't": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "it's": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "it'll": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "They'd've": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Ima": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ma"} + "Cannot": [ + {ORTH: "Can", LEMMA: "can", TAG: "MD"}, + {ORTH: "not", LEMMA: "not", TAG: "RB"} ], "gonna": [ @@ -1342,452 +451,45 @@ TOKENIZER_EXCEPTIONS = { {ORTH: "na", LEMMA: "to"} ], - "whats": [ - {ORTH: "what"}, - {ORTH: "s"} + "gotta": [ + {ORTH: "got"}, + {ORTH: "ta", LEMMA: "to"} ], - "How's": [ - {ORTH: "How"}, - {ORTH: "'s"} + "Gotta": [ + {ORTH: "Got"}, + {ORTH: "ta", LEMMA: "to"} ], - "Shouldntve": [ - {ORTH: "Should"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} + "let's": [ + {ORTH: "let"}, + {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"} ], - "youd": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Whatll": [ - {ORTH: "What"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "Wouldn't've": [ - {ORTH: "Would"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "How'd": [ - {ORTH: "How"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "doesnt": [ - {ORTH: "does", LEMMA: "do", TAG: "VBZ"}, - {ORTH: "nt", LEMMA: "not", 
TAG: "RB"} - ], - - "Shouldn't": [ - {ORTH: "Should"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "He'd've": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Mightntve": [ - {ORTH: "Might"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "couldnt": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Haven't": [ - {ORTH: "Have", TAG: "VB"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "doesn't": [ - {ORTH: "does", LEMMA: "do", TAG: "VBZ"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Hasn't": [ - {ORTH: "Has"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "how's": [ - {ORTH: "how"}, - {ORTH: "'s"} - ], - - "hes": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "s"} - ], - - "he'll": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "hed": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "how'd": [ - {ORTH: "how"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "we're": [ - {ORTH: "we", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM :"are"} - ], - - "Hadnt": [ - {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Shant": [ - {ORTH: "Sha", LEMMA: "shall"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Theyve": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Hows": [ - {ORTH: "How"}, - {ORTH: "s"} - ], - - "We'll": [ - {ORTH: "We"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "i've": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Whove": [ - {ORTH: "Who"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "i'ma": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ma"} - ], - - "Howd": [ - {ORTH: "How"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "hadnt": [ - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "shant": [ - {ORTH: "sha", LEMMA: "shall"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "There'd've": [ - {ORTH: "There"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "I'll": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Why's": [ - {ORTH: "Why"}, - {ORTH: "'s"} - ], - - "Shouldn't've": [ - {ORTH: "Should"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Wasnt": [ - {ORTH: "Was"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "whove": [ - {ORTH: "who"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "hasn't": [ - {ORTH: "has"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "wouldntve": [ - {ORTH: "would"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Wheres": [ - {ORTH: "Where"}, - {ORTH: "s"} - ], - - "How'll": [ - {ORTH: "How"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "there'd've": [ - {ORTH: "there"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Whos": [ - {ORTH: "Who"}, - {ORTH: "s"} - ], - - "shes": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "s"} - ], - - "Doesn't": [ - {ORTH: "Does", LEMMA: "do", TAG: "VBZ"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Arent": [ - 
{ORTH: "Are", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Hasnt": [ - {ORTH: "Has", LEMMA: "have"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "He's": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "wasnt": [ - {ORTH: "was"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "whyll": [ - {ORTH: "why"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "mustnt": [ - {ORTH: "must"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "He'd": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Shes": [ - {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "s"} - ], - - "where've": [ - {ORTH: "where"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Youll": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "hasnt": [ - {ORTH: "has"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "theyll": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "it'd've": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "itdve": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "wedve": [ - {ORTH: "we"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Werent": [ - {ORTH: "Were"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Therell": [ - {ORTH: "There"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "shan't": [ - {ORTH: "sha", LEMMA: "shall"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Wont": [ - {ORTH: "Wo", LEMMA: "will"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "hadntve": [ - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "who've": [ - {ORTH: "who"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Whatre": [ - {ORTH: "What"}, - {ORTH: "re", LEMMA: "be", NORM: "are"} - ], - - "'s": [ - {ORTH: "'s", LEMMA: "'s"} - ], - - "where'd": [ - {ORTH: "where"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "shouldve": [ - {ORTH: "should"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "where's": [ - {ORTH: "where"}, - {ORTH: "'s"} - ], - - "neednt": [ - {ORTH: "need"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "It'll": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "We'd": [ - {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Whats": [ - {ORTH: "What"}, - {ORTH: "s"} + "Let's": [ + {ORTH: "Let", LEMMA: "let"}, + {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"} ], "\u2014": [ {ORTH: "\u2014", TAG: ":", LEMMA: "--"} ], - "Itd": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} + "\n": [ + {ORTH: "\n", TAG: "SP"} ], - "she'd": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], + "\t": [ + {ORTH: "\t", TAG: "SP"} + ] +} - "Mustnt": [ - {ORTH: "Must"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - "Notve": [ - {ORTH: "Not", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], +# Abbreviations - "you'll": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Theyd": [ - {ORTH: "They", LEMMA: PRON_LEMMA, 
TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "she's": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "Couldnt": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "that's": [ - {ORTH: "that"}, - {ORTH: "'s"} - ], - - "'em": [ - {ORTH: "'em", LEMMA: PRON_LEMMA, NORM: "them"} - ], - - "ol'": [ - {ORTH: "ol'", LEMMA: "old"} +ABBREVIATIONS = { + "Mt.": [ + {ORTH: "Mt.", LEMMA: "Mount"} ], "Ak.": [ @@ -2000,41 +702,41 @@ TOKENIZER_EXCEPTIONS = { } +TOKENIZER_EXCEPTIONS = dict(EXC) +TOKENIZER_EXCEPTIONS.update(OTHER) +TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS) + + +# Remove EXCLUDE_EXC if in exceptions + +for string in EXCLUDE_EXC: + if string in TOKENIZER_EXCEPTIONS: + TOKENIZER_EXCEPTIONS.pop(string) + + +# Abbreviations with only one ORTH token + ORTH_ONLY = [ - "''", - "\")", - "a.", + "'d", "a.m.", "Adm.", - "b.", "Bros.", - "c.", "co.", "Co.", "Corp.", - "d.", "D.C.", "Dr.", - "e.", "e.g.", "E.g.", "E.G.", - "f.", - "g.", "Gen.", "Gov.", - "h.", - "i.", "i.e.", "I.e.", "I.E.", "Inc.", - "j.", "Jr.", - "k.", - "l.", "Ltd.", - "m.", "Md.", "Messrs.", "Mo.", @@ -2042,24 +744,11 @@ ORTH_ONLY = [ "Mr.", "Mrs.", "Ms.", - "n.", - "o.", - "p.", "p.m.", "Ph.D.", - "q.", - "r.", "Rep.", "Rev.", - "s.", "Sen.", "St.", - "t.", - "u.", - "v.", - "vs.", - "w.", - "x.", - "y.", - "z." + "vs." ] diff --git a/spacy/es/language_data.py b/spacy/es/language_data.py index 3357c9ac8..7c44752cb 100644 --- a/spacy/es/language_data.py +++ b/spacy/es/language_data.py @@ -40,11 +40,14 @@ def get_time_exc(hours): return exc -TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) STOP_WORDS = set(STOP_WORDS) + +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1))) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/es/tokenizer_exceptions.py b/spacy/es/tokenizer_exceptions.py index f9259ce93..93bc74642 100644 --- a/spacy/es/tokenizer_exceptions.py +++ b/spacy/es/tokenizer_exceptions.py @@ -85,55 +85,29 @@ TOKENIZER_EXCEPTIONS = { ORTH_ONLY = [ - "a.", "a.C.", "a.J.C.", "apdo.", "Av.", "Avda.", - "b.", - "c.", "Cía.", - "d.", - "e.", "etc.", - "f.", - "g.", "Gob.", "Gral.", - "h.", - "i.", "Ing.", - "j.", "J.C.", - "k.", - "l.", "Lic.", - "m.", "m.n.", - "n.", "no.", "núm.", - "o.", - "p.", "P.D.", "Prof.", "Profa.", - "q.", "q.e.p.d." - "r.", - "s.", "S.A.", "S.L.", "s.s.s.", "Sr.", "Sra.", - "Srta.", - "t.", - "u.", - "v.", - "w.", - "x.", - "y.", - "z." + "Srta." ] diff --git a/spacy/fr/language_data.py b/spacy/fr/language_data.py index e612fe064..bbbeb1535 100644 --- a/spacy/fr/language_data.py +++ b/spacy/fr/language_data.py @@ -2,13 +2,16 @@ from __future__ import unicode_literals from .. 
import language_data as base -from ..language_data import strings_to_exc +from ..language_data import strings_to_exc, update_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/hu/language_data.py b/spacy/hu/language_data.py index 94eeb6f4d..49652c5ac 100644 --- a/spacy/hu/language_data.py +++ b/spacy/hu/language_data.py @@ -4,21 +4,25 @@ from __future__ import unicode_literals import six from spacy.language_data import strings_to_exc, update_exc -from .punctuations import * +from .punctuation import * from .stop_words import STOP_WORDS from .tokenizer_exceptions import ABBREVIATIONS from .tokenizer_exceptions import OTHER_EXC from .. import language_data as base + STOP_WORDS = set(STOP_WORDS) + + TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) -TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES + TOKENIZER_PREFIXES -TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES -TOKENIZER_INFIXES = TOKENIZER_INFIXES - -# HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]] - +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS)) + +TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES +TOKENIZER_SUFFIXES = base.TOKENIZER_SUFFIXES + TOKENIZER_SUFFIXES +TOKENIZER_INFIXES = TOKENIZER_INFIXES + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] diff --git a/spacy/hu/punctuation.py b/spacy/hu/punctuation.py new file mode 100644 index 000000000..e28052fd3 --- /dev/null +++ b/spacy/hu/punctuation.py @@ -0,0 +1,25 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..language_data.punctuation import ALPHA, ALPHA_LOWER, ALPHA_UPPER, LIST_ELLIPSES + + +TOKENIZER_SUFFIXES = [ + r'(?<=[{al})])-e'.format(al=ALPHA_LOWER) +] + +TOKENIZER_INFIXES = [ + r'(?<=[0-9])-(?=[0-9])', + r'(?<=[0-9])[+\-\*/^](?=[0-9])', + r'(?<=[{a}])--(?=[{a}])', + r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), + r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r'(?<=[0-9{a}])"(?=[\-{a}])'.format(a=ALPHA), + r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA) +] + + +TOKENIZER_INFIXES += LIST_ELLIPSES + + +__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] diff --git a/spacy/hu/punctuations.py b/spacy/hu/punctuations.py deleted file mode 100644 index 3681a2fbe..000000000 --- a/spacy/hu/punctuations.py +++ /dev/null @@ -1,89 +0,0 @@ -# encoding: utf8 -from __future__ import unicode_literals - -TOKENIZER_PREFIXES = r''' -+ -'''.strip().split('\n') - -TOKENIZER_SUFFIXES = r''' -, -\" -\) -\] -\} -\* -\! -\? -\$ -> -: -; -' -” -“ -« -_ -'' -’ -‘ -€ -\.\. -\.\.\. -\.\.\.\. -(?<=[a-züóőúéáűí)\]"'´«‘’%\)²“”+-])\. -(?<=[a-züóőúéáűí)])-e -\-\- -´ -(?<=[0-9])\+ -(?<=[a-z0-9üóőúéáűí][\)\]”"'%\)§/])\. -(?<=[0-9])km² -(?<=[0-9])m² -(?<=[0-9])cm² -(?<=[0-9])mm² -(?<=[0-9])km³ -(?<=[0-9])m³ -(?<=[0-9])cm³ -(?<=[0-9])mm³ -(?<=[0-9])ha -(?<=[0-9])km -(?<=[0-9])m -(?<=[0-9])cm -(?<=[0-9])mm -(?<=[0-9])µm -(?<=[0-9])nm -(?<=[0-9])yd -(?<=[0-9])in -(?<=[0-9])ft -(?<=[0-9])kg -(?<=[0-9])g -(?<=[0-9])mg -(?<=[0-9])µg -(?<=[0-9])t -(?<=[0-9])lb -(?<=[0-9])oz -(?<=[0-9])m/s -(?<=[0-9])km/h -(?<=[0-9])mph -(?<=°[FCK])\. 
-(?<=[0-9])hPa -(?<=[0-9])Pa -(?<=[0-9])mbar -(?<=[0-9])mb -(?<=[0-9])T -(?<=[0-9])G -(?<=[0-9])M -(?<=[0-9])K -(?<=[0-9])kb -'''.strip().split('\n') - -TOKENIZER_INFIXES = r''' -… -\.\.+ -(?<=[a-züóőúéáűí])\.(?=[A-ZÜÓŐÚÉÁŰÍ]) -(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ0-9])"(?=[\-a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) -(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])--(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) -(?<=[0-9])[+\-\*/^](?=[0-9]) -(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]),(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) -'''.strip().split('\n') - -__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] diff --git a/spacy/hu/tokenizer_exceptions.py b/spacy/hu/tokenizer_exceptions.py index 627035bb8..46122564c 100644 --- a/spacy/hu/tokenizer_exceptions.py +++ b/spacy/hu/tokenizer_exceptions.py @@ -111,7 +111,6 @@ Vcs. Vhr. X.Y. Zs. -a. a.C. ac. adj. @@ -126,7 +125,6 @@ ang. arch. at. aug. -b. b.a. b.s. b.sc. @@ -141,7 +139,6 @@ br. bsc. bt. btk. -c. ca. cc. cca. @@ -155,7 +152,6 @@ csc. csüt. cső. ctv. -d. dbj. dd. ddr. @@ -170,7 +166,6 @@ dolg. dr. du. dzs. -e. ea. ed. eff. @@ -186,7 +181,6 @@ etc. ev. ezr. eü. -f. f.h. f.é. fam. @@ -213,7 +207,6 @@ főig. főisk. főtörm. főv. -g. gazd. gimn. gk. @@ -225,7 +218,6 @@ gy. gyak. gyártm. gör. -h. hads. hallg. hdm. @@ -266,7 +258,6 @@ isk. ism. izr. iá. -j. jan. jav. jegyz. @@ -278,7 +269,6 @@ jr. jvb. júl. jún. -k. karb. kat. kb. @@ -313,7 +303,6 @@ közl. közp. közt. kü. -l. lat. ld. legs. @@ -324,7 +313,6 @@ lt. ltd. ltp. luth. -m. m.a. m.s. m.sc. @@ -359,7 +347,6 @@ műh. műsz. műv. művez. -n. nagyker. nagys. nat. @@ -372,7 +359,6 @@ ny. nyilv. nyrt. nyug. -o. obj. okl. okt. @@ -381,7 +367,6 @@ orsz. ort. ov. ovh. -p. pf. pg. ph.d @@ -404,8 +389,6 @@ pság. ptk. pu. pü. -q. -r. r.k. rac. rad. @@ -420,7 +403,6 @@ rkt. rt. rtg. röv. -s. s.b. s.k. sa. @@ -450,7 +432,6 @@ szt. szubj. szöv. szül. -t. tanm. tb. tbk. @@ -476,13 +457,11 @@ tvr. ty. törv. tü. -u. ua. ui. unit. uo. uv. -v. vas. vb. vegy. @@ -501,9 +480,6 @@ vv. vál. vízv. vö. -w. -y. -z. zrt. zs. Ész. @@ -520,7 +496,6 @@ zs. évf. í. ó. -ö. össz. ötk. özv. @@ -528,7 +503,6 @@ zs. úm. ún. út. -ü. üag. üd. üdv. @@ -544,6 +518,5 @@ zs. 
""".strip().split() OTHER_EXC = """ -'' -e """.strip().split() diff --git a/spacy/it/__init__.py b/spacy/it/__init__.py index 2ef60fd94..bc0d13cab 100644 --- a/spacy/it/__init__.py +++ b/spacy/it/__init__.py @@ -1,8 +1,6 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from os import path - from ..language import Language from ..attrs import LANG diff --git a/spacy/it/language_data.py b/spacy/it/language_data.py index 8683f83ac..a4a657c33 100644 --- a/spacy/it/language_data.py +++ b/spacy/it/language_data.py @@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py index f6aa4317c..43a4ef0be 100644 --- a/spacy/language_data/__init__.py +++ b/spacy/language_data/__init__.py @@ -1,3 +1,4 @@ +from .abbreviations import * from .emoticons import * from .punctuation import * from .tag_map import * diff --git a/spacy/language_data/abbreviations.py b/spacy/language_data/abbreviations.py new file mode 100644 index 000000000..b49daa0ad --- /dev/null +++ b/spacy/language_data/abbreviations.py @@ -0,0 +1,43 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +ABBREVIATIONS = [ + "'", + "\\\")", + "", + "''", + "C++", + "a.", + "b.", + "c.", + "d.", + "e.", + "f.", + "g.", + "h.", + "i.", + "j.", + "k.", + "l.", + "m.", + "n.", + "o.", + "p.", + "q.", + "r.", + "s.", + "t.", + "u.", + "v.", + "w.", + "x.", + "y.", + "z.", + "ä.", + "ö.", + "ü." +] + + +__all__ = [ "ABBREVIATIONS" ] diff --git a/spacy/language_data/emoticons.py b/spacy/language_data/emoticons.py index 3fa44368d..bc951a007 100644 --- a/spacy/language_data/emoticons.py +++ b/spacy/language_data/emoticons.py @@ -13,6 +13,7 @@ EMOTICONS = set(""" (-: =) (= +") :] :-] [: diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index fb784271e..d8ed19ca1 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -1,133 +1,115 @@ # encoding: utf8 from __future__ import unicode_literals - -TOKENIZER_PREFIXES = r''' -, -" -( -[ -{ -* -< -> -$ -£ -¡ -¿ -„ -“ -' -`` -` -# -‘ -.... -... -… -‚ -» -§ -US$ -C$ -A$ -a- -'''.strip().split('\n') +import re -TOKENIZER_SUFFIXES = r''' -, -\" -\) -\] -\} -\* -\! -\? -% -\$ -> -: -; -' -” -“ -« -_ -'' -'s -'S -’s -’S -’ -‘ -° -€ -… -\.\. -\.\.\. -\.\.\.\. -(?<=[a-z0-9)\]”"'%\)])\. -(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\. 
-\-\- -´ -(?<=[0-9])km² -(?<=[0-9])m² -(?<=[0-9])cm² -(?<=[0-9])mm² -(?<=[0-9])km³ -(?<=[0-9])m³ -(?<=[0-9])cm³ -(?<=[0-9])mm³ -(?<=[0-9])ha -(?<=[0-9])km -(?<=[0-9])m -(?<=[0-9])cm -(?<=[0-9])mm -(?<=[0-9])µm -(?<=[0-9])nm -(?<=[0-9])yd -(?<=[0-9])in -(?<=[0-9])ft -(?<=[0-9])kg -(?<=[0-9])g -(?<=[0-9])mg -(?<=[0-9])µg -(?<=[0-9])t -(?<=[0-9])lb -(?<=[0-9])oz -(?<=[0-9])m/s -(?<=[0-9])km/h -(?<=[0-9])mph -(?<=[0-9])°C -(?<=[0-9])°K -(?<=[0-9])°F -(?<=[0-9])hPa -(?<=[0-9])Pa -(?<=[0-9])mbar -(?<=[0-9])mb -(?<=[0-9])T -(?<=[0-9])G -(?<=[0-9])M -(?<=[0-9])K -(?<=[0-9])kb -'''.strip().split('\n') +_ALPHA_LOWER = """ +a ä à á â ǎ æ ã å ā ă ą b c ç ć č ĉ ċ c̄ d ð ď e é è ê ë ė ȅ ȩ ẽ ę f g ĝ ğ h i ı +î ï í ī ì ȉ ǐ į ĩ j k ķ l ł ļ m n ñ ń ň ņ o ö ó ò ő ô õ œ ø ō ő ǒ ơ p q r ř ŗ s +ß ś š ş ŝ t ť u ú û ù ú ū ű ǔ ů ų ư v w ŵ x y ÿ ý ỳ ŷ ỹ z ź ž ż þ +""" -TOKENIZER_INFIXES = r''' -… -\.\.\.+ -(?<=[a-z])\.(?=[A-Z]) -(?<=[a-z])\.(?=[A-Z]) -(?<=[a-zA-Z])-(?=[a-zA-z]) -(?<=[a-zA-Z])--(?=[a-zA-z]) -(?<=[0-9])-(?=[0-9]) -(?<=[A-Za-z]),(?=[A-Za-z]) -(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ]) -(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ]) -(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ]) -(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ]) -'''.strip().split('\n') +_ALPHA_UPPER = """ +A Ä À Á  Ǎ Æ Ã Å Ā Ă Ą B C Ç Ć Č Ĉ Ċ C̄ D Ð Ď E É È Ê Ë Ė Ȅ Ȩ Ẽ Ę F G Ĝ Ğ H I İ +Î Ï Í Ī Ì Ȉ Ǐ Į Ĩ J K Ķ L Ł Ļ M N Ñ Ń Ň Ņ O Ö Ó Ò Ő Ô Õ Œ Ø Ō Ő Ǒ Ơ P Q R Ř Ŗ S +Ś Š Ş Ŝ T Ť U Ú Û Ù Ú Ū Ű Ǔ Ů Ų Ư V W Ŵ X Y Ÿ Ý Ỳ Ŷ Ỹ Z Ź Ž Ż Þ +""" + + +_UNITS = """ +km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft kg g mg +µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb +TB T G M K +""" + + +_CURRENCY = r""" +\$ £ € ¥ ฿ US\$ C\$ A\$ +""" + + +_QUOTES = r""" +' '' " ” “ `` ` ‘ ´ ‚ , „ » « +""" + + +_PUNCT = r""" +… , : ; \! \? 
¿ ¡ \( \) \[ \] \{ \} < > _ # \* & +""" + + +_HYPHENS = r""" +- – — -- --- +""" + + +LIST_ELLIPSES = [ + r'\.\.+', + "…" +] + + +LIST_CURRENCY = list(_CURRENCY.strip().split()) +LIST_QUOTES = list(_QUOTES.strip().split()) +LIST_PUNCT = list(_PUNCT.strip().split()) +LIST_HYPHENS = list(_HYPHENS.strip().split()) + + +ALPHA_LOWER = _ALPHA_LOWER.strip().replace(' ', '') +ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '') +ALPHA = ALPHA_LOWER + ALPHA_UPPER + + +QUOTES = _QUOTES.strip().replace(' ', '|') +CURRENCY = _CURRENCY.strip().replace(' ', '|') +UNITS = _UNITS.strip().replace(' ', '|') +HYPHENS = _HYPHENS.strip().replace(' ', '|') + + + +# Prefixes + +TOKENIZER_PREFIXES = ( + ['§', '%', r'\+'] + + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_CURRENCY +) + + +# Suffixes + +TOKENIZER_SUFFIXES = ( + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + [ + r'(?<=[0-9])\+', + r'(?<=°[FfCcKk])\.', + r'(?<=[0-9])(?:{c})'.format(c=CURRENCY), + r'(?<=[0-9])(?:{u})'.format(u=UNITS), + r'(?<=[0-9{al}{p}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES), + r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER), + "'s", "'S", "’s", "’S" + ] +) + + +# Infixes + +TOKENIZER_INFIXES = ( + LIST_ELLIPSES + + [ + r'(?<=[0-9])[+\-\*/^](?=[0-9])', + r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), + r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS), + r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA) + ] +) __all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] diff --git a/spacy/language_data/tag_map.py b/spacy/language_data/tag_map.py index f5b6b5040..966960721 100644 --- a/spacy/language_data/tag_map.py +++ b/spacy/language_data/tag_map.py @@ -20,5 +20,6 @@ TAG_MAP = { "X": {POS: X}, "CONJ": {POS: CONJ}, "ADJ": {POS: ADJ}, - "VERB": {POS: VERB} + "VERB": {POS: VERB}, + "PART": {POS: PART} } diff --git a/spacy/nl/__init__.py b/spacy/nl/__init__.py index d958783ea..d4aa39506 100644 --- a/spacy/nl/__init__.py +++ b/spacy/nl/__init__.py @@ -1,8 +1,6 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from os import path - from ..language import Language from ..attrs import LANG from .language_data import * diff --git a/spacy/nl/language_data.py b/spacy/nl/language_data.py index 8683f83ac..a4a657c33 100644 --- a/spacy/nl/language_data.py +++ b/spacy/nl/language_data.py @@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/pt/__init__.py b/spacy/pt/__init__.py index 06c6417dc..ed26fb0b3 100644 --- a/spacy/pt/__init__.py +++ b/spacy/pt/__init__.py @@ -1,8 +1,6 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from os import path - from ..language import Language from ..attrs import LANG diff --git a/spacy/pt/language_data.py b/spacy/pt/language_data.py index 8683f83ac..a4a657c33 100644 --- a/spacy/pt/language_data.py +++ b/spacy/pt/language_data.py @@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, 
strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/sv/__init__.py b/spacy/sv/__init__.py index 25930386a..e03c9a56f 100644 --- a/spacy/sv/__init__.py +++ b/spacy/sv/__init__.py @@ -1,8 +1,6 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from os import path - from ..language import Language from ..attrs import LANG from .language_data import * diff --git a/spacy/sv/language_data.py b/spacy/sv/language_data.py index 8683f83ac..a4a657c33 100644 --- a/spacy/sv/language_data.py +++ b/spacy/sv/language_data.py @@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/tests/website/__init__.py b/spacy/tests/de/__init__.py similarity index 100% rename from spacy/tests/website/__init__.py rename to spacy/tests/de/__init__.py diff --git a/spacy/tests/de/conftest.py b/spacy/tests/de/conftest.py new file mode 100644 index 000000000..c6b8be26e --- /dev/null +++ b/spacy/tests/de/conftest.py @@ -0,0 +1,11 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +from ...de import German + + +@pytest.fixture +def de_tokenizer(): + return German.Defaults.create_tokenizer() diff --git a/spacy/tests/de/tokenizer/__init__.py b/spacy/tests/de/tokenizer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/de/tokenizer/test_exceptions.py b/spacy/tests/de/tokenizer/test_exceptions.py new file mode 100644 index 000000000..13da3dc33 --- /dev/null +++ b/spacy/tests/de/tokenizer/test_exceptions.py @@ -0,0 +1,27 @@ +# coding: utf-8 +"""Test that tokenizer exceptions and emoticons are handled correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"]) +def test_tokenizer_splits_contractions(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 2 + + +@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."]) +def test_tokenizer_handles_abbr(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 1 + + +def test_tokenizer_handles_exc_in_text(de_tokenizer): + text = "Ich bin z.Zt. im Urlaub." + tokens = de_tokenizer(text) + assert len(tokens) == 6 + assert tokens[2].text == "z.Zt."
+ assert tokens[2].lemma_ == "zur Zeit" diff --git a/spacy/tests/de/tokenizer/test_prefix_suffix_infix.py b/spacy/tests/de/tokenizer/test_prefix_suffix_infix.py new file mode 100644 index 000000000..dcf4f4ef0 --- /dev/null +++ b/spacy/tests/de/tokenizer/test_prefix_suffix_infix.py @@ -0,0 +1,116 @@ +# coding: utf-8 +"""Test that tokenizer prefixes, suffixes and infixes are handled correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["(unter)"]) +def test_tokenizer_splits_no_special(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["unter'm"]) +def test_tokenizer_splits_no_punct(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 2 + + +@pytest.mark.parametrize('text', ["(unter'm"]) +def test_tokenizer_splits_prefix_punct(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["unter'm)"]) +def test_tokenizer_splits_suffix_punct(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["(unter'm)"]) +def test_tokenizer_splits_even_wrap(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 4 + + +@pytest.mark.parametrize('text', ["(unter'm?)"]) +def test_tokenizer_splits_uneven_wrap(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 5 + + +@pytest.mark.parametrize('text,length', [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)]) +def test_tokenizer_splits_prefix_interact(de_tokenizer, text, length): + tokens = de_tokenizer(text) + assert len(tokens) == length + + +@pytest.mark.parametrize('text', ["z.B.)"]) +def test_tokenizer_splits_suffix_interact(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 2 + + +@pytest.mark.parametrize('text', ["(z.B.)"]) +def test_tokenizer_splits_even_wrap_interact(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["(z.B.?)"]) +def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 4 + + +@pytest.mark.parametrize('text', ["blau-rot"]) +def test_tokenizer_splits_hyphens(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"]) +def test_tokenizer_splits_numeric_range(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["blau.Rot", "Hallo.Welt"]) +def test_tokenizer_splits_period_infix(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["Hallo,Welt", "eins,zwei"]) +def test_tokenizer_splits_comma_infix(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + assert tokens[0].text == text.split(",")[0] + assert tokens[1].text == "," + assert tokens[2].text == text.split(",")[1] + + +@pytest.mark.parametrize('text', ["blau...Rot", "blau...rot"]) +def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +def test_tokenizer_splits_double_hyphen_infix(de_tokenizer): + tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.") + assert len(tokens) == 12 + assert tokens[0].text == "Viele" + assert tokens[1].text == "Regeln" + assert tokens[2].text == "--" + assert tokens[3].text == "wie" + 
assert tokens[4].text == "die" + assert tokens[5].text == "Bindestrich" + assert tokens[6].text == "-" + assert tokens[7].text == "Regeln" + assert tokens[8].text == "--" + assert tokens[9].text == "sind" + assert tokens[10].text == "kompliziert" diff --git a/spacy/tests/de/tokenizer/test_text.py b/spacy/tests/de/tokenizer/test_text.py new file mode 100644 index 000000000..84fa6f2a5 --- /dev/null +++ b/spacy/tests/de/tokenizer/test_text.py @@ -0,0 +1,45 @@ +# coding: utf-8 +"""Test that longer and mixed texts are tokenized correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +def test_tokenizer_handles_long_text(de_tokenizer): + text = """Die Verwandlung + +Als Gregor Samsa eines Morgens aus unruhigen Träumen erwachte, fand er sich in +seinem Bett zu einem ungeheueren Ungeziefer verwandelt. + +Er lag auf seinem panzerartig harten Rücken und sah, wenn er den Kopf ein wenig +hob, seinen gewölbten, braunen, von bogenförmigen Versteifungen geteilten +Bauch, auf dessen Höhe sich die Bettdecke, zum gänzlichen Niedergleiten bereit, +kaum noch erhalten konnte. Seine vielen, im Vergleich zu seinem sonstigen +Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen. + +»Was ist mit mir geschehen?«, dachte er.""" + + tokens = de_tokenizer(text) + assert len(tokens) == 109 + + +@pytest.mark.parametrize('text,length', [ + ("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1), + ("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1), + ("Kraftfahrzeug-Haftpflichtversicherung", 3), + ("Vakuum-Mittelfrequenz-Induktionsofen", 5) + ]) +def test_tokenizer_handles_long_words(de_tokenizer, text, length): + tokens = de_tokenizer(text) + assert len(tokens) == length + + +@pytest.mark.parametrize('text,length', [ + ("»Was ist mit mir geschehen?«, dachte er.", 12), + ("“Dies frühzeitige Aufstehen”, dachte er, “macht einen ganz blödsinnig. ", 15) + ]) +def test_tokenizer_handles_examples(de_tokenizer, text, length): + tokens = de_tokenizer(text) + assert len(tokens) == length diff --git a/spacy/tests/en/__init__.py b/spacy/tests/en/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/en/conftest.py b/spacy/tests/en/conftest.py new file mode 100644 index 000000000..3a3516c41 --- /dev/null +++ b/spacy/tests/en/conftest.py @@ -0,0 +1,11 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +from ...en import English + + +@pytest.fixture +def en_tokenizer(): + return English.Defaults.create_tokenizer() diff --git a/spacy/tests/en/tokenizer/__init__.py b/spacy/tests/en/tokenizer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/en/tokenizer/test_contractions.py b/spacy/tests/en/tokenizer/test_contractions.py new file mode 100644 index 000000000..a97b8f5ba --- /dev/null +++ b/spacy/tests/en/tokenizer/test_contractions.py @@ -0,0 +1,87 @@ +# coding: utf-8 +"""Test that tokens are created correctly for contractions.""" + + +from __future__ import unicode_literals + +import pytest + + +def test_tokenizer_handles_basic_contraction(en_tokenizer): + text = "don't giggle" + tokens = en_tokenizer(text) + assert len(tokens) == 3 + assert tokens[1].text == "n't" + text = "i said don't!" + tokens = en_tokenizer(text) + assert len(tokens) == 5 + assert tokens[4].text == "!" 
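# Illustrative aside, not part of this patch: the splits asserted in this
# module can be reproduced outside pytest with the same factory the
# en_tokenizer fixture uses. The lemma check mirrors what the removed
# tests/tokenizer/test_contractions.py verified for "ain't"; treat it as an
# expectation about the English special-case table rather than a new test.
def _contraction_sketch():
    from spacy.en import English
    tokenizer = English.Defaults.create_tokenizer()
    tokens = tokenizer("don't")
    assert [t.text for t in tokens] == ["do", "n't"]  # special case splits the contraction
    assert tokens[1].lemma_ == "not"                  # and normalises the lemma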
+ + +@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"]) +def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")]) +def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text): + tokens = en_tokenizer(text_poss) + assert len(tokens) == 2 + assert tokens[0].text == text + assert tokens[1].text == "'s" + + +@pytest.mark.parametrize('text', ["schools'", "Alexis'"]) +def test_tokenizer_splits_trailing_apos(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 2 + assert tokens[0].text == text.split("'")[0] + assert tokens[1].text == "'" + + +@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"]) +def test_tokenizer_doesnt_split_apos_exc(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].text == text + + +@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"]) +def test_tokenizer_handles_ll_contraction(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 2 + assert tokens[0].text == text.split("'")[0] + assert tokens[1].text == "'ll" + assert tokens[1].lemma_ == "will" + + +@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")]) +def test_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title): + tokens_lower = en_tokenizer(text_lower) + tokens_title = en_tokenizer(text_title) + assert tokens_title[0].text == tokens_lower[0].text.title() + assert tokens_lower[0].text == tokens_title[0].text.lower() + assert tokens_lower[1].text == tokens_title[1].text + + +@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"]) +@pytest.mark.parametrize('contraction', ["'ll", "'d"]) +def test_tokenizer_keeps_title_case(en_tokenizer, pron, contraction): + tokens = en_tokenizer(pron + contraction) + assert tokens[0].text == pron + assert tokens[1].text == contraction + + +@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"]) +def test_tokenizer_excludes_ambiguous(en_tokenizer, exc): + tokens = en_tokenizer(exc) + assert len(tokens) == 1 + + +@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")]) +def test_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct): + tokens = en_tokenizer(wo_punct) + assert len(tokens) == 2 + tokens = en_tokenizer(w_punct) + assert len(tokens) == 3 diff --git a/spacy/tests/en/tokenizer/test_exceptions.py b/spacy/tests/en/tokenizer/test_exceptions.py new file mode 100644 index 000000000..ac7ed452f --- /dev/null +++ b/spacy/tests/en/tokenizer/test_exceptions.py @@ -0,0 +1,20 @@ +# coding: utf-8 +"""Test that tokenizer exceptions are handled correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."]) +def test_tokenizer_handles_abbr(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 1 + + +def test_tokenizer_handles_exc_in_text(en_tokenizer): + text = "It's mediocre i.e. bad." + tokens = en_tokenizer(text) + assert len(tokens) == 6 + assert tokens[3].text == "i.e."
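Aside (not part of the patch itself): the abbreviation behaviour exercised above is driven by the shared list added in spacy/language_data/abbreviations.py and merged per language exactly as in the it/nl/pt/sv language_data.py hunks earlier in this diff. Below is a minimal sketch of that pattern, assuming strings_to_exc maps each string to a one-token special case keyed by its surface form and update_exc merges the additions into the target dict, as their use in this patch suggests.

    from spacy.language_data import (ABBREVIATIONS, EMOTICONS,
                                     strings_to_exc, update_exc)
    from spacy.en import English

    # Build an exception table the same way the language_data modules now do.
    exceptions = strings_to_exc(EMOTICONS)
    update_exc(exceptions, strings_to_exc(ABBREVIATIONS))
    assert "a." in exceptions and "C++" in exceptions

    # The compiled tokenizer keeps such strings together as single tokens,
    # which is what the tests above assert for "e.g.", "i.e." and friends.
    tokenizer = English.Defaults.create_tokenizer()
    assert len(tokenizer("e.g.")) == 1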
diff --git a/spacy/tests/tokenizer/test_indices.py b/spacy/tests/en/tokenizer/test_indices.py similarity index 91% rename from spacy/tests/tokenizer/test_indices.py rename to spacy/tests/en/tokenizer/test_indices.py index 5df7bcc59..0ed6ca4dc 100644 --- a/spacy/tests/tokenizer/test_indices.py +++ b/spacy/tests/en/tokenizer/test_indices.py @@ -1,12 +1,14 @@ +# coding: utf-8 """Test that token.idx correctly computes index into the original string.""" + from __future__ import unicode_literals import pytest def test_simple_punct(en_tokenizer): - text = 'to walk, do foo' + text = "to walk, do foo" tokens = en_tokenizer(text) assert tokens[0].idx == 0 assert tokens[1].idx == 3 @@ -16,7 +18,7 @@ def test_simple_punct(en_tokenizer): def test_complex_punct(en_tokenizer): - text = 'Tom (D., Ill.)!' + text = "Tom (D., Ill.)!" tokens = en_tokenizer(text) assert tokens[0].idx == 0 assert len(tokens[0]) == 3 diff --git a/spacy/tests/en/tokenizer/test_prefix_suffix_infix.py b/spacy/tests/en/tokenizer/test_prefix_suffix_infix.py new file mode 100644 index 000000000..042934d4e --- /dev/null +++ b/spacy/tests/en/tokenizer/test_prefix_suffix_infix.py @@ -0,0 +1,136 @@ +# coding: utf-8 +"""Test that tokenizer prefixes, suffixes and infixes are handled correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["(can)"]) +def test_tokenizer_splits_no_special(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["can't"]) +def test_tokenizer_splits_no_punct(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 2 + + +@pytest.mark.parametrize('text', ["(can't"]) +def test_tokenizer_splits_prefix_punct(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["can't)"]) +def test_tokenizer_splits_suffix_punct(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["(can't)"]) +def test_tokenizer_splits_even_wrap(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 4 + + +@pytest.mark.parametrize('text', ["(can't?)"]) +def test_tokenizer_splits_uneven_wrap(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 5 + + +@pytest.mark.parametrize('text,length', [("U.S.", 1), ("us.", 2), ("(U.S.", 2)]) +def test_tokenizer_splits_prefix_interact(en_tokenizer, text, length): + tokens = en_tokenizer(text) + assert len(tokens) == length + + +@pytest.mark.parametrize('text', ["U.S.)"]) +def test_tokenizer_splits_suffix_interact(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 2 + + +@pytest.mark.parametrize('text', ["(U.S.)"]) +def test_tokenizer_splits_even_wrap_interact(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["(U.S.?)"]) +def test_tokenizer_splits_uneven_wrap_interact(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 4 + + +@pytest.mark.parametrize('text', ["best-known"]) +def test_tokenizer_splits_hyphens(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"]) +def test_tokenizer_splits_numeric_range(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["best.Known", "Hello.World"]) +def test_tokenizer_splits_period_infix(en_tokenizer, text): + 
tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["Hello,world", "one,two"]) +def test_tokenizer_splits_comma_infix(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + assert tokens[0].text == text.split(",")[0] + assert tokens[1].text == "," + assert tokens[2].text == text.split(",")[1] + + +@pytest.mark.parametrize('text', ["best...Known", "best...known"]) +def test_tokenizer_splits_ellipsis_infix(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +def test_tokenizer_splits_double_hyphen_infix(en_tokenizer): + tokens = en_tokenizer("No decent--let alone well-bred--people.") + assert tokens[0].text == "No" + assert tokens[1].text == "decent" + assert tokens[2].text == "--" + assert tokens[3].text == "let" + assert tokens[4].text == "alone" + assert tokens[5].text == "well" + assert tokens[6].text == "-" + assert tokens[7].text == "bred" + assert tokens[8].text == "--" + assert tokens[9].text == "people" + + +@pytest.mark.xfail +def test_tokenizer_splits_period_abbr(en_tokenizer): + text = "Today is Tuesday.Mr." + tokens = en_tokenizer(text) + assert len(tokens) == 5 + assert tokens[0].text == "Today" + assert tokens[1].text == "is" + assert tokens[2].text == "Tuesday" + assert tokens[3].text == "." + assert tokens[4].text == "Mr." + + +@pytest.mark.xfail +def test_tokenizer_splits_em_dash_infix(en_tokenizer): + # Re Issue #225 + tokens = en_tokenizer("""Will this road take me to Puddleton?\u2014No, """ + """you'll have to walk there.\u2014Ariel.""") + assert tokens[6].text == "Puddleton" + assert tokens[7].text == "?" + assert tokens[8].text == "\u2014" diff --git a/spacy/tests/en/tokenizer/test_punct.py b/spacy/tests/en/tokenizer/test_punct.py new file mode 100644 index 000000000..b6ae9224d --- /dev/null +++ b/spacy/tests/en/tokenizer/test_punct.py @@ -0,0 +1,132 @@ +# coding: utf-8 +"""Test that open, closed and paired punctuation is split off correctly.""" + + +from __future__ import unicode_literals + +import pytest + +from ....util import compile_prefix_regex +from ....language_data import TOKENIZER_PREFIXES + + + +en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search + +PUNCT_OPEN = ['(', '[', '{', '*'] +PUNCT_CLOSE = [')', ']', '}', '*'] +PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')] + + +@pytest.mark.parametrize('text', ["(", "((", "<"]) +def test_tokenizer_handles_only_punct(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == len(text) + + +@pytest.mark.parametrize('punct', PUNCT_OPEN) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_open_punct(en_tokenizer, punct, text): + tokens = en_tokenizer(punct + text) + assert len(tokens) == 2 + assert tokens[0].text == punct + assert tokens[1].text == text + + +@pytest.mark.parametrize('punct', PUNCT_CLOSE) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_close_punct(en_tokenizer, punct, text): + tokens = en_tokenizer(text + punct) + assert len(tokens) == 2 + assert tokens[0].text == text + assert tokens[1].text == punct + + +@pytest.mark.parametrize('punct', PUNCT_OPEN) +@pytest.mark.parametrize('punct_add', ["`"]) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text): + tokens = en_tokenizer(punct + punct_add + text) + assert len(tokens) == 3 + assert tokens[0].text == punct + assert tokens[1].text == punct_add + assert tokens[2].text == text + + 
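# Illustrative aside, not part of this patch: the suffix rules rewritten in
# language_data/punctuation.py can be inspected the same way en_search_prefixes
# above does it for prefixes, assuming compile_suffix_regex is available
# alongside compile_prefix_regex in spacy.util (the Defaults tokenizer factory
# appears to use both). The matches below are expectations that follow from
# the new rules: closing quotes and number-suffixed units are split off at the
# end of a string.
def _suffix_regex_sketch():
    from spacy.util import compile_suffix_regex
    from spacy.language_data import TOKENIZER_SUFFIXES
    suffix_search = compile_suffix_regex(TOKENIZER_SUFFIXES).search
    assert suffix_search("100km").group() == "km"    # (?<=[0-9])(?:UNITS)
    assert suffix_search("Hello''").group() == "''"  # quotes are suffixes too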
+@pytest.mark.parametrize('punct', PUNCT_CLOSE) +@pytest.mark.parametrize('punct_add', ["'"]) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text): + tokens = en_tokenizer(text + punct + punct_add) + assert len(tokens) == 3 + assert tokens[0].text == text + assert tokens[1].text == punct + assert tokens[2].text == punct_add + + +@pytest.mark.parametrize('punct', PUNCT_OPEN) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_same_open_punct(en_tokenizer, punct, text): + tokens = en_tokenizer(punct + punct + punct + text) + assert len(tokens) == 4 + assert tokens[0].text == punct + assert tokens[3].text == text + + +@pytest.mark.parametrize('punct', PUNCT_CLOSE) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_same_close_punct(en_tokenizer, punct, text): + tokens = en_tokenizer(text + punct + punct + punct) + assert len(tokens) == 4 + assert tokens[0].text == text + assert tokens[1].text == punct + + +@pytest.mark.parametrize('text', ["'The"]) +def test_tokenizer_splits_open_apostrophe(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 2 + assert tokens[0].text == "'" + + +@pytest.mark.parametrize('text', ["Hello''"]) +def test_tokenizer_splits_double_end_quote(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 2 + tokens_punct = en_tokenizer("''") + assert len(tokens_punct) == 1 + + +@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close, text): + tokens = en_tokenizer(punct_open + text + punct_close) + assert len(tokens) == 3 + assert tokens[0].text == punct_open + assert tokens[1].text == text + assert tokens[2].text == punct_close + + +@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) +@pytest.mark.parametrize('punct_open_add,punct_close_add', [("`", "'")]) +@pytest.mark.parametrize('text', ["Hello"]) +def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, punct_close_add, text): + tokens = en_tokenizer(punct_open_add + punct_open + text + punct_close + punct_close_add) + assert len(tokens) == 5 + assert tokens[0].text == punct_open_add + assert tokens[1].text == punct_open + assert tokens[2].text == text + assert tokens[3].text == punct_close + assert tokens[4].text == punct_close_add + + +@pytest.mark.parametrize('text,punct', [("(can't", "(")]) +def test_tokenizer_splits_pre_punct_regex(text, punct): + match = en_search_prefixes(text) + assert match.group() == punct + + +def test_tokenizer_splits_bracket_period(en_tokenizer): + text = "(And a 6a.m. run through Washington Park)." + tokens = en_tokenizer(text) + assert tokens[len(tokens) - 1].text == "." diff --git a/spacy/tests/en/tokenizer/test_text.py b/spacy/tests/en/tokenizer/test_text.py new file mode 100644 index 000000000..c7178fbf9 --- /dev/null +++ b/spacy/tests/en/tokenizer/test_text.py @@ -0,0 +1,36 @@ +# coding: utf-8 +"""Test that longer and mixed texts are tokenized correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +def test_tokenizer_handles_long_text(en_tokenizer): + text = """Tributes pour in for late British Labour Party leader + +Tributes poured in from around the world Thursday +to the late Labour Party leader John Smith, who died earlier from a massive +heart attack aged 55.
+ +In Washington, the US State Department issued a statement regretting "the +untimely death" of the rapier-tongued Scottish barrister and parliamentarian. + +"Mr. Smith, throughout his distinguished""" + tokens = en_tokenizer(text) + assert len(tokens) == 76 + + +@pytest.mark.parametrize('text,length', [ + ("The U.S. Army likes Shock and Awe.", 8), + ("U.N. regulations are not a part of their concern.", 10), + ("“Isn't it?”", 6), + ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15), + ("""'Me too!', Mr. P. Delaware cried. """, 11), + ("They ran about 10km.", 6), + # ("But then the 6,000-year ice age came...", 10) + ]) +def test_tokenizer_handles_cnts(en_tokenizer, text, length): + tokens = en_tokenizer(text) + assert len(tokens) == length diff --git a/spacy/tests/hu/conftest.py b/spacy/tests/hu/conftest.py new file mode 100644 index 000000000..222bd1b00 --- /dev/null +++ b/spacy/tests/hu/conftest.py @@ -0,0 +1,11 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +from ...hu import Hungarian + + +@pytest.fixture +def hu_tokenizer(): + return Hungarian.Defaults.create_tokenizer() diff --git a/spacy/tests/hu/tokenizer/test_tokenizer.py b/spacy/tests/hu/tokenizer/test_tokenizer.py index 2bfbfdf36..0b76da0c6 100644 --- a/spacy/tests/hu/tokenizer/test_tokenizer.py +++ b/spacy/tests/hu/tokenizer/test_tokenizer.py @@ -2,25 +2,27 @@ from __future__ import unicode_literals import pytest -from spacy.hu import Hungarian -_DEFAULT_TESTS = [('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), - ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']), - ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), - ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), - ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']), - ('A .hu.', ['A', '.hu', '.']), - ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), - ('A pl.', ['A', 'pl.']), - ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), - ('Egy..ket.', ['Egy', '..', 'ket', '.']), - ('Valami... van.', ['Valami', '...', 'van', '.']), - ('Valami ...van...', ['Valami', '...', 'van', '...']), - ('Valami...', ['Valami', '...']), - ('Valami ...', ['Valami', '...']), - ('Valami ... más.', ['Valami', '...', 'más', '.'])] -_HYPHEN_TESTS = [ +DEFAULT_TESTS = [ + ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), + ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']), + ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), + ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), + ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']), + ('A .hu.', ['A', '.hu', '.']), + ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), + ('A pl.', ['A', 'pl.']), + ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), + ('Egy..ket.', ['Egy', '..', 'ket', '.']), + ('Valami... van.', ['Valami', '...', 'van', '.']), + ('Valami ...van...', ['Valami', '...', 'van', '...']), + ('Valami...', ['Valami', '...']), + ('Valami ...', ['Valami', '...']), + ('Valami ... 
más.', ['Valami', '...', 'más', '.']) +] + +HYPHEN_TESTS = [ ('Egy -nak, -jaiért, -magyar, bel- van.', ['Egy', '-nak', ',', '-jaiért', ',', '-magyar', ',', 'bel-', 'van', '.']), ('Egy -nak.', ['Egy', '-nak', '.']), ('Egy bel-.', ['Egy', 'bel-', '.']), @@ -39,195 +41,194 @@ _HYPHEN_TESTS = [ ('A 7-es.', ['A', '7-es', '.']), ('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']), ('A %-sal.', ['A', '%-sal', '.']), - ('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])] + ('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.']) +] -_NUMBER_TESTS = [('A 2b van.', ['A', '2b', 'van', '.']), - ('A 2b-ben van.', ['A', '2b-ben', 'van', '.']), - ('A 2b.', ['A', '2b', '.']), - ('A 2b-ben.', ['A', '2b-ben', '.']), - ('A 3.b van.', ['A', '3.b', 'van', '.']), - ('A 3.b-ben van.', ['A', '3.b-ben', 'van', '.']), - ('A 3.b.', ['A', '3.b', '.']), - ('A 3.b-ben.', ['A', '3.b-ben', '.']), - ('A 1:20:36.7 van.', ['A', '1:20:36.7', 'van', '.']), - ('A 1:20:36.7-ben van.', ['A', '1:20:36.7-ben', 'van', '.']), - ('A 1:20:36.7-ben.', ['A', '1:20:36.7-ben', '.']), - ('A 1:35 van.', ['A', '1:35', 'van', '.']), - ('A 1:35-ben van.', ['A', '1:35-ben', 'van', '.']), - ('A 1:35-ben.', ['A', '1:35-ben', '.']), - ('A 1.35 van.', ['A', '1.35', 'van', '.']), - ('A 1.35-ben van.', ['A', '1.35-ben', 'van', '.']), - ('A 1.35-ben.', ['A', '1.35-ben', '.']), - ('A 4:01,95 van.', ['A', '4:01,95', 'van', '.']), - ('A 4:01,95-ben van.', ['A', '4:01,95-ben', 'van', '.']), - ('A 4:01,95-ben.', ['A', '4:01,95-ben', '.']), - ('A 10--12 van.', ['A', '10--12', 'van', '.']), - ('A 10--12-ben van.', ['A', '10--12-ben', 'van', '.']), - ('A 10--12-ben.', ['A', '10--12-ben', '.']), - ('A 10‐12 van.', ['A', '10‐12', 'van', '.']), - ('A 10‐12-ben van.', ['A', '10‐12-ben', 'van', '.']), - ('A 10‐12-ben.', ['A', '10‐12-ben', '.']), - ('A 10‑12 van.', ['A', '10‑12', 'van', '.']), - ('A 10‑12-ben van.', ['A', '10‑12-ben', 'van', '.']), - ('A 10‑12-ben.', ['A', '10‑12-ben', '.']), - ('A 10‒12 van.', ['A', '10‒12', 'van', '.']), - ('A 10‒12-ben van.', ['A', '10‒12-ben', 'van', '.']), - ('A 10‒12-ben.', ['A', '10‒12-ben', '.']), - ('A 10–12 van.', ['A', '10–12', 'van', '.']), - ('A 10–12-ben van.', ['A', '10–12-ben', 'van', '.']), - ('A 10–12-ben.', ['A', '10–12-ben', '.']), - ('A 10—12 van.', ['A', '10—12', 'van', '.']), - ('A 10—12-ben van.', ['A', '10—12-ben', 'van', '.']), - ('A 10—12-ben.', ['A', '10—12-ben', '.']), - ('A 10―12 van.', ['A', '10―12', 'van', '.']), - ('A 10―12-ben van.', ['A', '10―12-ben', 'van', '.']), - ('A 10―12-ben.', ['A', '10―12-ben', '.']), - ('A -23,12 van.', ['A', '-23,12', 'van', '.']), - ('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']), - ('A -23,12-ben.', ['A', '-23,12-ben', '.']), - ('A 2+3 van.', ['A', '2', '+', '3', 'van', '.']), - ('A 2 +3 van.', ['A', '2', '+', '3', 'van', '.']), - ('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']), - ('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']), - ('A 2*3 van.', ['A', '2', '*', '3', 'van', '.']), - ('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']), - ('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']), - ('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']), - ('A C++ van.', ['A', 'C++', 'van', '.']), - ('A C++-ben van.', ['A', 'C++-ben', 'van', '.']), - ('A C++.', ['A', 'C++', '.']), - ('A C++-ben.', ['A', 'C++-ben', '.']), - ('A 2003. I. 06. van.', ['A', '2003.', 'I.', '06.', 'van', '.']), - ('A 2003. I. 06-ben van.', ['A', '2003.', 'I.', '06-ben', 'van', '.']), - ('A 2003. I. 06.', ['A', '2003.', 'I.', '06.']), - ('A 2003. I. 06-ben.', ['A', '2003.', 'I.', '06-ben', '.']), - ('A 2003. 
01. 06. van.', ['A', '2003.', '01.', '06.', 'van', '.']), - ('A 2003. 01. 06-ben van.', ['A', '2003.', '01.', '06-ben', 'van', '.']), - ('A 2003. 01. 06.', ['A', '2003.', '01.', '06.']), - ('A 2003. 01. 06-ben.', ['A', '2003.', '01.', '06-ben', '.']), - ('A IV. 12. van.', ['A', 'IV.', '12.', 'van', '.']), - ('A IV. 12-ben van.', ['A', 'IV.', '12-ben', 'van', '.']), - ('A IV. 12.', ['A', 'IV.', '12.']), - ('A IV. 12-ben.', ['A', 'IV.', '12-ben', '.']), - ('A 2003.01.06. van.', ['A', '2003.01.06.', 'van', '.']), - ('A 2003.01.06-ben van.', ['A', '2003.01.06-ben', 'van', '.']), - ('A 2003.01.06.', ['A', '2003.01.06.']), - ('A 2003.01.06-ben.', ['A', '2003.01.06-ben', '.']), - ('A IV.12. van.', ['A', 'IV.12.', 'van', '.']), - ('A IV.12-ben van.', ['A', 'IV.12-ben', 'van', '.']), - ('A IV.12.', ['A', 'IV.12.']), - ('A IV.12-ben.', ['A', 'IV.12-ben', '.']), - ('A 1.1.2. van.', ['A', '1.1.2.', 'van', '.']), - ('A 1.1.2-ben van.', ['A', '1.1.2-ben', 'van', '.']), - ('A 1.1.2.', ['A', '1.1.2.']), - ('A 1.1.2-ben.', ['A', '1.1.2-ben', '.']), - ('A 1,5--2,5 van.', ['A', '1,5--2,5', 'van', '.']), - ('A 1,5--2,5-ben van.', ['A', '1,5--2,5-ben', 'van', '.']), - ('A 1,5--2,5-ben.', ['A', '1,5--2,5-ben', '.']), - ('A 3,14 van.', ['A', '3,14', 'van', '.']), - ('A 3,14-ben van.', ['A', '3,14-ben', 'van', '.']), - ('A 3,14-ben.', ['A', '3,14-ben', '.']), - ('A 3.14 van.', ['A', '3.14', 'van', '.']), - ('A 3.14-ben van.', ['A', '3.14-ben', 'van', '.']), - ('A 3.14-ben.', ['A', '3.14-ben', '.']), - ('A 15. van.', ['A', '15.', 'van', '.']), - ('A 15-ben van.', ['A', '15-ben', 'van', '.']), - ('A 15-ben.', ['A', '15-ben', '.']), - ('A 15.-ben van.', ['A', '15.-ben', 'van', '.']), - ('A 15.-ben.', ['A', '15.-ben', '.']), - ('A 2002--2003. van.', ['A', '2002--2003.', 'van', '.']), - ('A 2002--2003-ben van.', ['A', '2002--2003-ben', 'van', '.']), - ('A 2002--2003-ben.', ['A', '2002--2003-ben', '.']), - ('A -0,99% van.', ['A', '-0,99%', 'van', '.']), - ('A -0,99%-ben van.', ['A', '-0,99%-ben', 'van', '.']), - ('A -0,99%.', ['A', '-0,99%', '.']), - ('A -0,99%-ben.', ['A', '-0,99%-ben', '.']), - ('A 10--20% van.', ['A', '10--20%', 'van', '.']), - ('A 10--20%-ben van.', ['A', '10--20%-ben', 'van', '.']), - ('A 10--20%.', ['A', '10--20%', '.']), - ('A 10--20%-ben.', ['A', '10--20%-ben', '.']), - ('A 99§ van.', ['A', '99§', 'van', '.']), - ('A 99§-ben van.', ['A', '99§-ben', 'van', '.']), - ('A 99§-ben.', ['A', '99§-ben', '.']), - ('A 10--20§ van.', ['A', '10--20§', 'van', '.']), - ('A 10--20§-ben van.', ['A', '10--20§-ben', 'van', '.']), - ('A 10--20§-ben.', ['A', '10--20§-ben', '.']), - ('A 99° van.', ['A', '99°', 'van', '.']), - ('A 99°-ben van.', ['A', '99°-ben', 'van', '.']), - ('A 99°-ben.', ['A', '99°-ben', '.']), - ('A 10--20° van.', ['A', '10--20°', 'van', '.']), - ('A 10--20°-ben van.', ['A', '10--20°-ben', 'van', '.']), - ('A 10--20°-ben.', ['A', '10--20°-ben', '.']), - ('A °C van.', ['A', '°C', 'van', '.']), - ('A °C-ben van.', ['A', '°C-ben', 'van', '.']), - ('A °C.', ['A', '°C', '.']), - ('A °C-ben.', ['A', '°C-ben', '.']), - ('A 100°C van.', ['A', '100°C', 'van', '.']), - ('A 100°C-ben van.', ['A', '100°C-ben', 'van', '.']), - ('A 100°C.', ['A', '100°C', '.']), - ('A 100°C-ben.', ['A', '100°C-ben', '.']), - ('A 800x600 van.', ['A', '800x600', 'van', '.']), - ('A 800x600-ben van.', ['A', '800x600-ben', 'van', '.']), - ('A 800x600-ben.', ['A', '800x600-ben', '.']), - ('A 1x2x3x4 van.', ['A', '1x2x3x4', 'van', '.']), - ('A 1x2x3x4-ben van.', ['A', '1x2x3x4-ben', 'van', '.']), - ('A 1x2x3x4-ben.', ['A', 
'1x2x3x4-ben', '.']), - ('A 5/J van.', ['A', '5/J', 'van', '.']), - ('A 5/J-ben van.', ['A', '5/J-ben', 'van', '.']), - ('A 5/J-ben.', ['A', '5/J-ben', '.']), - ('A 5/J. van.', ['A', '5/J.', 'van', '.']), - ('A 5/J.-ben van.', ['A', '5/J.-ben', 'van', '.']), - ('A 5/J.-ben.', ['A', '5/J.-ben', '.']), - ('A III/1 van.', ['A', 'III/1', 'van', '.']), - ('A III/1-ben van.', ['A', 'III/1-ben', 'van', '.']), - ('A III/1-ben.', ['A', 'III/1-ben', '.']), - ('A III/1. van.', ['A', 'III/1.', 'van', '.']), - ('A III/1.-ben van.', ['A', 'III/1.-ben', 'van', '.']), - ('A III/1.-ben.', ['A', 'III/1.-ben', '.']), - ('A III/c van.', ['A', 'III/c', 'van', '.']), - ('A III/c-ben van.', ['A', 'III/c-ben', 'van', '.']), - ('A III/c.', ['A', 'III/c', '.']), - ('A III/c-ben.', ['A', 'III/c-ben', '.']), - ('A TU–154 van.', ['A', 'TU–154', 'van', '.']), - ('A TU–154-ben van.', ['A', 'TU–154-ben', 'van', '.']), - ('A TU–154-ben.', ['A', 'TU–154-ben', '.'])] +NUMBER_TESTS = [ + ('A 2b van.', ['A', '2b', 'van', '.']), + ('A 2b-ben van.', ['A', '2b-ben', 'van', '.']), + ('A 2b.', ['A', '2b', '.']), + ('A 2b-ben.', ['A', '2b-ben', '.']), + ('A 3.b van.', ['A', '3.b', 'van', '.']), + ('A 3.b-ben van.', ['A', '3.b-ben', 'van', '.']), + ('A 3.b.', ['A', '3.b', '.']), + ('A 3.b-ben.', ['A', '3.b-ben', '.']), + ('A 1:20:36.7 van.', ['A', '1:20:36.7', 'van', '.']), + ('A 1:20:36.7-ben van.', ['A', '1:20:36.7-ben', 'van', '.']), + ('A 1:20:36.7-ben.', ['A', '1:20:36.7-ben', '.']), + ('A 1:35 van.', ['A', '1:35', 'van', '.']), + ('A 1:35-ben van.', ['A', '1:35-ben', 'van', '.']), + ('A 1:35-ben.', ['A', '1:35-ben', '.']), + ('A 1.35 van.', ['A', '1.35', 'van', '.']), + ('A 1.35-ben van.', ['A', '1.35-ben', 'van', '.']), + ('A 1.35-ben.', ['A', '1.35-ben', '.']), + ('A 4:01,95 van.', ['A', '4:01,95', 'van', '.']), + ('A 4:01,95-ben van.', ['A', '4:01,95-ben', 'van', '.']), + ('A 4:01,95-ben.', ['A', '4:01,95-ben', '.']), + ('A 10--12 van.', ['A', '10--12', 'van', '.']), + ('A 10--12-ben van.', ['A', '10--12-ben', 'van', '.']), + ('A 10--12-ben.', ['A', '10--12-ben', '.']), + ('A 10‐12 van.', ['A', '10‐12', 'van', '.']), + ('A 10‐12-ben van.', ['A', '10‐12-ben', 'van', '.']), + ('A 10‐12-ben.', ['A', '10‐12-ben', '.']), + ('A 10‑12 van.', ['A', '10‑12', 'van', '.']), + ('A 10‑12-ben van.', ['A', '10‑12-ben', 'van', '.']), + ('A 10‑12-ben.', ['A', '10‑12-ben', '.']), + ('A 10‒12 van.', ['A', '10‒12', 'van', '.']), + ('A 10‒12-ben van.', ['A', '10‒12-ben', 'van', '.']), + ('A 10‒12-ben.', ['A', '10‒12-ben', '.']), + ('A 10–12 van.', ['A', '10–12', 'van', '.']), + ('A 10–12-ben van.', ['A', '10–12-ben', 'van', '.']), + ('A 10–12-ben.', ['A', '10–12-ben', '.']), + ('A 10—12 van.', ['A', '10—12', 'van', '.']), + ('A 10—12-ben van.', ['A', '10—12-ben', 'van', '.']), + ('A 10—12-ben.', ['A', '10—12-ben', '.']), + ('A 10―12 van.', ['A', '10―12', 'van', '.']), + ('A 10―12-ben van.', ['A', '10―12-ben', 'van', '.']), + ('A 10―12-ben.', ['A', '10―12-ben', '.']), + ('A -23,12 van.', ['A', '-23,12', 'van', '.']), + ('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']), + ('A -23,12-ben.', ['A', '-23,12-ben', '.']), + ('A 2+3 van.', ['A', '2', '+', '3', 'van', '.']), + ('A 2 +3 van.', ['A', '2', '+', '3', 'van', '.']), + ('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']), + ('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']), + ('A 2*3 van.', ['A', '2', '*', '3', 'van', '.']), + ('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']), + ('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']), + ('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']), + ('A C++ 
van.', ['A', 'C++', 'van', '.']), + ('A C++-ben van.', ['A', 'C++-ben', 'van', '.']), + ('A C++.', ['A', 'C++', '.']), + ('A C++-ben.', ['A', 'C++-ben', '.']), + ('A 2003. I. 06. van.', ['A', '2003.', 'I.', '06.', 'van', '.']), + ('A 2003. I. 06-ben van.', ['A', '2003.', 'I.', '06-ben', 'van', '.']), + ('A 2003. I. 06.', ['A', '2003.', 'I.', '06.']), + ('A 2003. I. 06-ben.', ['A', '2003.', 'I.', '06-ben', '.']), + ('A 2003. 01. 06. van.', ['A', '2003.', '01.', '06.', 'van', '.']), + ('A 2003. 01. 06-ben van.', ['A', '2003.', '01.', '06-ben', 'van', '.']), + ('A 2003. 01. 06.', ['A', '2003.', '01.', '06.']), + ('A 2003. 01. 06-ben.', ['A', '2003.', '01.', '06-ben', '.']), + ('A IV. 12. van.', ['A', 'IV.', '12.', 'van', '.']), + ('A IV. 12-ben van.', ['A', 'IV.', '12-ben', 'van', '.']), + ('A IV. 12.', ['A', 'IV.', '12.']), + ('A IV. 12-ben.', ['A', 'IV.', '12-ben', '.']), + ('A 2003.01.06. van.', ['A', '2003.01.06.', 'van', '.']), + ('A 2003.01.06-ben van.', ['A', '2003.01.06-ben', 'van', '.']), + ('A 2003.01.06.', ['A', '2003.01.06.']), + ('A 2003.01.06-ben.', ['A', '2003.01.06-ben', '.']), + ('A IV.12. van.', ['A', 'IV.12.', 'van', '.']), + ('A IV.12-ben van.', ['A', 'IV.12-ben', 'van', '.']), + ('A IV.12.', ['A', 'IV.12.']), + ('A IV.12-ben.', ['A', 'IV.12-ben', '.']), + ('A 1.1.2. van.', ['A', '1.1.2.', 'van', '.']), + ('A 1.1.2-ben van.', ['A', '1.1.2-ben', 'van', '.']), + ('A 1.1.2.', ['A', '1.1.2.']), + ('A 1.1.2-ben.', ['A', '1.1.2-ben', '.']), + ('A 1,5--2,5 van.', ['A', '1,5--2,5', 'van', '.']), + ('A 1,5--2,5-ben van.', ['A', '1,5--2,5-ben', 'van', '.']), + ('A 1,5--2,5-ben.', ['A', '1,5--2,5-ben', '.']), + ('A 3,14 van.', ['A', '3,14', 'van', '.']), + ('A 3,14-ben van.', ['A', '3,14-ben', 'van', '.']), + ('A 3,14-ben.', ['A', '3,14-ben', '.']), + ('A 3.14 van.', ['A', '3.14', 'van', '.']), + ('A 3.14-ben van.', ['A', '3.14-ben', 'van', '.']), + ('A 3.14-ben.', ['A', '3.14-ben', '.']), + ('A 15. van.', ['A', '15.', 'van', '.']), + ('A 15-ben van.', ['A', '15-ben', 'van', '.']), + ('A 15-ben.', ['A', '15-ben', '.']), + ('A 15.-ben van.', ['A', '15.-ben', 'van', '.']), + ('A 15.-ben.', ['A', '15.-ben', '.']), + ('A 2002--2003. 
van.', ['A', '2002--2003.', 'van', '.']), + ('A 2002--2003-ben van.', ['A', '2002--2003-ben', 'van', '.']), + ('A 2002--2003-ben.', ['A', '2002--2003-ben', '.']), + ('A -0,99% van.', ['A', '-0,99%', 'van', '.']), + ('A -0,99%-ben van.', ['A', '-0,99%-ben', 'van', '.']), + ('A -0,99%.', ['A', '-0,99%', '.']), + ('A -0,99%-ben.', ['A', '-0,99%-ben', '.']), + ('A 10--20% van.', ['A', '10--20%', 'van', '.']), + ('A 10--20%-ben van.', ['A', '10--20%-ben', 'van', '.']), + ('A 10--20%.', ['A', '10--20%', '.']), + ('A 10--20%-ben.', ['A', '10--20%-ben', '.']), + ('A 99§ van.', ['A', '99§', 'van', '.']), + ('A 99§-ben van.', ['A', '99§-ben', 'van', '.']), + ('A 99§-ben.', ['A', '99§-ben', '.']), + ('A 10--20§ van.', ['A', '10--20§', 'van', '.']), + ('A 10--20§-ben van.', ['A', '10--20§-ben', 'van', '.']), + ('A 10--20§-ben.', ['A', '10--20§-ben', '.']), + ('A 99° van.', ['A', '99°', 'van', '.']), + ('A 99°-ben van.', ['A', '99°-ben', 'van', '.']), + ('A 99°-ben.', ['A', '99°-ben', '.']), + ('A 10--20° van.', ['A', '10--20°', 'van', '.']), + ('A 10--20°-ben van.', ['A', '10--20°-ben', 'van', '.']), + ('A 10--20°-ben.', ['A', '10--20°-ben', '.']), + ('A °C van.', ['A', '°C', 'van', '.']), + ('A °C-ben van.', ['A', '°C-ben', 'van', '.']), + ('A °C.', ['A', '°C', '.']), + ('A °C-ben.', ['A', '°C-ben', '.']), + ('A 100°C van.', ['A', '100°C', 'van', '.']), + ('A 100°C-ben van.', ['A', '100°C-ben', 'van', '.']), + ('A 100°C.', ['A', '100°C', '.']), + ('A 100°C-ben.', ['A', '100°C-ben', '.']), + ('A 800x600 van.', ['A', '800x600', 'van', '.']), + ('A 800x600-ben van.', ['A', '800x600-ben', 'van', '.']), + ('A 800x600-ben.', ['A', '800x600-ben', '.']), + ('A 1x2x3x4 van.', ['A', '1x2x3x4', 'van', '.']), + ('A 1x2x3x4-ben van.', ['A', '1x2x3x4-ben', 'van', '.']), + ('A 1x2x3x4-ben.', ['A', '1x2x3x4-ben', '.']), + ('A 5/J van.', ['A', '5/J', 'van', '.']), + ('A 5/J-ben van.', ['A', '5/J-ben', 'van', '.']), + ('A 5/J-ben.', ['A', '5/J-ben', '.']), + ('A 5/J. van.', ['A', '5/J.', 'van', '.']), + ('A 5/J.-ben van.', ['A', '5/J.-ben', 'van', '.']), + ('A 5/J.-ben.', ['A', '5/J.-ben', '.']), + ('A III/1 van.', ['A', 'III/1', 'van', '.']), + ('A III/1-ben van.', ['A', 'III/1-ben', 'van', '.']), + ('A III/1-ben.', ['A', 'III/1-ben', '.']), + ('A III/1. van.', ['A', 'III/1.', 'van', '.']), + ('A III/1.-ben van.', ['A', 'III/1.-ben', 'van', '.']), + ('A III/1.-ben.', ['A', 'III/1.-ben', '.']), + ('A III/c van.', ['A', 'III/c', 'van', '.']), + ('A III/c-ben van.', ['A', 'III/c-ben', 'van', '.']), + ('A III/c.', ['A', 'III/c', '.']), + ('A III/c-ben.', ['A', 'III/c-ben', '.']), + ('A TU–154 van.', ['A', 'TU–154', 'van', '.']), + ('A TU–154-ben van.', ['A', 'TU–154-ben', 'van', '.']), + ('A TU–154-ben.', ['A', 'TU–154-ben', '.']) +] -_QUOTE_TESTS = [('Az "Ime, hat"-ban irja.', ['Az', '"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), - ('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), - ('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']), - ('Egy 24"-os monitor.', ['Egy', '24', '"', '-os', 'monitor', '.']), - ("A don't van.", ['A', "don't", 'van', '.'])] +QUOTE_TESTS = [ + ('Az "Ime, hat"-ban irja.', ['Az', '"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), + ('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), + ('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']), + ('Egy 24"-os monitor.', ['Egy', '24', '"', '-os', 'monitor', '.']), + ("A don't van.", ['A', "don't", 'van', '.']) +] -_DOT_TESTS = [('N. 
kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), - ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']), - ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), - ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), - ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']), - ('A .hu.', ['A', '.hu', '.']), - ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), - ('A pl.', ['A', 'pl.']), - ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), - ('Egy..ket.', ['Egy', '..', 'ket', '.']), - ('Valami... van.', ['Valami', '...', 'van', '.']), - ('Valami ...van...', ['Valami', '...', 'van', '...']), - ('Valami...', ['Valami', '...']), - ('Valami ...', ['Valami', '...']), - ('Valami ... más.', ['Valami', '...', 'más', '.'])] +DOT_TESTS = [ + ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), + ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']), + ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), + ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), + ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']), + ('A .hu.', ['A', '.hu', '.']), + ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), + ('A pl.', ['A', 'pl.']), + ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), + ('Egy..ket.', ['Egy', '..', 'ket', '.']), + ('Valami... van.', ['Valami', '...', 'van', '.']), + ('Valami ...van...', ['Valami', '...', 'van', '...']), + ('Valami...', ['Valami', '...']), + ('Valami ...', ['Valami', '...']), + ('Valami ... más.', ['Valami', '...', 'más', '.']) +] -@pytest.fixture(scope="session") -def HU(): - return Hungarian() +TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS # + NUMBER_TESTS + HYPHEN_TESTS -@pytest.fixture(scope="module") -def hu_tokenizer(HU): - return HU.tokenizer - - -@pytest.mark.parametrize(("input", "expected_tokens"), - _DEFAULT_TESTS + _HYPHEN_TESTS + _NUMBER_TESTS + _DOT_TESTS + _QUOTE_TESTS) -def test_testcases(hu_tokenizer, input, expected_tokens): - tokens = hu_tokenizer(input) - token_list = [token.orth_ for token in tokens if not token.is_space] +@pytest.mark.parametrize('text,expected_tokens', TESTCASES) +def test_tokenizer_handles_testcases(hu_tokenizer, text, expected_tokens): + tokens = hu_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] assert expected_tokens == token_list diff --git a/spacy/tests/regression/test_issue351.py b/spacy/tests/regression/test_issue351.py new file mode 100644 index 000000000..84d4398c5 --- /dev/null +++ b/spacy/tests/regression/test_issue351.py @@ -0,0 +1,16 @@ +from __future__ import unicode_literals +from ...en import English + +import pytest + + +@pytest.fixture +def en_tokenizer(): + return English.Defaults.create_tokenizer() + + +def test_issue351(en_tokenizer): + doc = en_tokenizer(" This is a cat.") + assert doc[0].idx == 0 + assert len(doc[0]) == 3 + assert doc[1].idx == 3 diff --git a/spacy/tests/regression/test_issue360.py b/spacy/tests/regression/test_issue360.py new file mode 100644 index 000000000..018289030 --- /dev/null +++ b/spacy/tests/regression/test_issue360.py @@ -0,0 +1,14 @@ +from __future__ import unicode_literals +from ...en import English + +import pytest + + +@pytest.fixture +def en_tokenizer(): + return English.Defaults.create_tokenizer() + + +def test_big_ellipsis(en_tokenizer): + tokens = en_tokenizer(u'$45...............Asking') + assert len(tokens) > 2 diff --git a/spacy/tests/sun.tokens b/spacy/tests/sun.tokens deleted file mode 100644 index 4b912e18e..000000000 --- a/spacy/tests/sun.tokens +++ /dev/null @@ -1,4 +0,0 @@ -The Sun is the star at the center of 
the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ] - -The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ] -Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of −26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of −1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. 
[ 24 ] [ 25 ] diff --git a/spacy/tests/tokenizer/conftest.py b/spacy/tests/tokenizer/conftest.py index 06ccde7b3..c8e340208 100644 --- a/spacy/tests/tokenizer/conftest.py +++ b/spacy/tests/tokenizer/conftest.py @@ -1,7 +1,23 @@ +# coding: utf-8 +from __future__ import unicode_literals + import pytest -from spacy.en import English + +from ...en import English +from ...de import German +from ...es import Spanish +from ...it import Italian +from ...fr import French +from ...pt import Portuguese +from ...nl import Dutch +from ...sv import Swedish +from ...hu import Hungarian -@pytest.fixture(scope="module") -def en_tokenizer(EN): - return EN.tokenizer +LANGUAGES = [English, German, Spanish, Italian, French, Dutch, Swedish, Hungarian] + + +@pytest.fixture(params=LANGUAGES) +def tokenizer(request): + lang = request.param + return lang.Defaults.create_tokenizer() diff --git a/spacy/tests/tokenizer/test_contractions.py b/spacy/tests/tokenizer/test_contractions.py deleted file mode 100644 index 568e34704..000000000 --- a/spacy/tests/tokenizer/test_contractions.py +++ /dev/null @@ -1,58 +0,0 @@ -from __future__ import unicode_literals -import pytest - - -def test_possess(en_tokenizer): - tokens = en_tokenizer("Mike's") - assert en_tokenizer.vocab.strings[tokens[0].orth] == "Mike" - assert en_tokenizer.vocab.strings[tokens[1].orth] == "'s" - assert len(tokens) == 2 - - -def test_apostrophe(en_tokenizer): - tokens = en_tokenizer("schools'") - assert len(tokens) == 2 - assert tokens[1].orth_ == "'" - assert tokens[0].orth_ == "schools" - - -def test_LL(en_tokenizer): - tokens = en_tokenizer("we'll") - assert len(tokens) == 2 - assert tokens[1].orth_ == "'ll" - assert tokens[1].lemma_ == "will" - assert tokens[0].orth_ == "we" - - -def test_aint(en_tokenizer): - tokens = en_tokenizer("ain't") - assert len(tokens) == 2 - assert tokens[0].orth_ == "ai" - assert tokens[0].lemma_ == "be" - assert tokens[1].orth_ == "n't" - assert tokens[1].lemma_ == "not" - -def test_capitalized(en_tokenizer): - tokens = en_tokenizer("can't") - assert len(tokens) == 2 - tokens = en_tokenizer("Can't") - assert len(tokens) == 2 - tokens = en_tokenizer("Ain't") - assert len(tokens) == 2 - assert tokens[0].orth_ == "Ai" - assert tokens[0].lemma_ == "be" - - -def test_punct(en_tokenizer): - tokens = en_tokenizer("We've") - assert len(tokens) == 2 - tokens = en_tokenizer("``We've") - assert len(tokens) == 3 - - -@pytest.mark.xfail -def test_therell(en_tokenizer): - tokens = en_tokenizer("there'll") - assert len(tokens) == 2 - assert tokens[0].text == "there" - assert tokens[1].text == "there" diff --git a/spacy/tests/tokenizer/test_emoticons.py b/spacy/tests/tokenizer/test_emoticons.py deleted file mode 100644 index e0022dbbd..000000000 --- a/spacy/tests/tokenizer/test_emoticons.py +++ /dev/null @@ -1,35 +0,0 @@ -from __future__ import unicode_literals -import pytest - - -def test_tweebo_challenge(en_tokenizer): - text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ....""" - tokens = en_tokenizer(text) - assert tokens[0].orth_ == ":o" - assert tokens[1].orth_ == ":/" - assert tokens[2].orth_ == ":'(" - assert tokens[3].orth_ == ">:o" - assert tokens[4].orth_ == "(:" - assert tokens[5].orth_ == ":)" - assert tokens[6].orth_ == ">.<" - assert tokens[7].orth_ == "XD" - assert tokens[8].orth_ == "-__-" - assert tokens[9].orth_ == "o.O" - assert tokens[10].orth_ == ";D" - assert tokens[11].orth_ == ":-)" - assert tokens[12].orth_ == "@_@" - assert tokens[13].orth_ == ":P" - assert tokens[14].orth_ == 
"8D" - assert tokens[15].orth_ == ":1" - assert tokens[16].orth_ == ">:(" - assert tokens[17].orth_ == ":D" - assert tokens[18].orth_ == "=|" - assert tokens[19].orth_ == '")' - assert tokens[20].orth_ == ':>' - assert tokens[21].orth_ == '....' - - -def test_false_positive(en_tokenizer): - text = "example:)" - tokens = en_tokenizer(text) - assert len(tokens) == 3 diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py new file mode 100644 index 000000000..aab27714e --- /dev/null +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -0,0 +1,41 @@ +# coding: utf-8 +"""Test that tokenizer exceptions and emoticons are handled correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +def test_tokenizer_handles_emoticons(tokenizer): + # Tweebo challenge (CMU) + text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ....""" + tokens = tokenizer(text) + assert tokens[0].text == ":o" + assert tokens[1].text == ":/" + assert tokens[2].text == ":'(" + assert tokens[3].text == ">:o" + assert tokens[4].text == "(:" + assert tokens[5].text == ":)" + assert tokens[6].text == ">.<" + assert tokens[7].text == "XD" + assert tokens[8].text == "-__-" + assert tokens[9].text == "o.O" + assert tokens[10].text == ";D" + assert tokens[11].text == ":-)" + assert tokens[12].text == "@_@" + assert tokens[13].text == ":P" + assert tokens[14].text == "8D" + assert tokens[15].text == ":1" + assert tokens[16].text == ">:(" + assert tokens[17].text == ":D" + assert tokens[18].text == "=|" + assert tokens[19].text == '")' + assert tokens[20].text == ':>' + assert tokens[21].text == '....' + + +@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)]) +def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length): + tokens = tokenizer(text) + assert len(tokens) == length diff --git a/spacy/tests/tokenizer/test_infix.py b/spacy/tests/tokenizer/test_infix.py deleted file mode 100644 index 1b7cbaa7b..000000000 --- a/spacy/tests/tokenizer/test_infix.py +++ /dev/null @@ -1,62 +0,0 @@ -from __future__ import unicode_literals - -import pytest - -def test_hyphen(en_tokenizer): - tokens = en_tokenizer('best-known') - assert len(tokens) == 3 - - -def test_numeric_range(en_tokenizer): - tokens = en_tokenizer('0.1-13.5') - assert len(tokens) == 3 - -def test_period(en_tokenizer): - tokens = en_tokenizer('best.Known') - assert len(tokens) == 3 - tokens = en_tokenizer('zombo.com') - assert len(tokens) == 1 - - -def test_ellipsis(en_tokenizer): - tokens = en_tokenizer('best...Known') - assert len(tokens) == 3 - tokens = en_tokenizer('best...known') - assert len(tokens) == 3 - -def test_big_ellipsis(en_tokenizer): - '''Test regression identified in Issue #360''' - tokens = en_tokenizer(u'$45...............Asking') - assert len(tokens) > 2 - - - -def test_email(en_tokenizer): - tokens = en_tokenizer('hello@example.com') - assert len(tokens) == 1 - tokens = en_tokenizer('hi+there@gmail.it') - assert len(tokens) == 1 - - -def test_double_hyphen(en_tokenizer): - tokens = en_tokenizer(u'No decent--let alone well-bred--people.') - assert tokens[0].text == u'No' - assert tokens[1].text == u'decent' - assert tokens[2].text == u'--' - assert tokens[3].text == u'let' - assert tokens[4].text == u'alone' - assert tokens[5].text == u'well' - assert tokens[6].text == u'-' - # TODO: This points to a deeper issue with the tokenizer: it doesn't re-enter - # on infixes. 
- assert tokens[7].text == u'bred' - assert tokens[8].text == u'--' - assert tokens[9].text == u'people' - - -def test_infix_comma(en_tokenizer): - # Re issue #326 - tokens = en_tokenizer(u'Hello,world') - assert tokens[0].text == u'Hello' - assert tokens[1].text == u',' - assert tokens[2].text == u'world' diff --git a/spacy/tests/tokenizer/test_only_punct.py b/spacy/tests/tokenizer/test_only_punct.py deleted file mode 100644 index 12c958088..000000000 --- a/spacy/tests/tokenizer/test_only_punct.py +++ /dev/null @@ -1,9 +0,0 @@ -from __future__ import unicode_literals - - -def test_only_pre1(en_tokenizer): - assert len(en_tokenizer("(")) == 1 - - -def test_only_pre2(en_tokenizer): - assert len(en_tokenizer("((")) == 2 diff --git a/spacy/tests/tokenizer/test_post_punct.py b/spacy/tests/tokenizer/test_post_punct.py deleted file mode 100644 index ff1120c63..000000000 --- a/spacy/tests/tokenizer/test_post_punct.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import unicode_literals -import pytest - - -@pytest.fixture -def close_puncts(): - return [')', ']', '}', '*'] - - -def test_close(close_puncts, en_tokenizer): - word_str = 'Hello' - for p in close_puncts: - string = word_str + p - tokens = en_tokenizer(string) - assert len(tokens) == 2 - assert tokens[1].string == p - assert tokens[0].string == word_str - - -def test_two_different_close(close_puncts, en_tokenizer): - word_str = 'Hello' - for p in close_puncts: - string = word_str + p + "'" - tokens = en_tokenizer(string) - assert len(tokens) == 3 - assert tokens[0].string == word_str - assert tokens[1].string == p - assert tokens[2].string == "'" - - -def test_three_same_close(close_puncts, en_tokenizer): - word_str = 'Hello' - for p in close_puncts: - string = word_str + p + p + p - tokens = en_tokenizer(string) - assert len(tokens) == 4 - assert tokens[0].string == word_str - assert tokens[1].string == p - - -def test_double_end_quote(en_tokenizer): - assert len(en_tokenizer("Hello''")) == 2 - assert len(en_tokenizer("''")) == 1 diff --git a/spacy/tests/tokenizer/test_pre_punct.py b/spacy/tests/tokenizer/test_pre_punct.py deleted file mode 100644 index 9aec1dc7b..000000000 --- a/spacy/tests/tokenizer/test_pre_punct.py +++ /dev/null @@ -1,46 +0,0 @@ -from __future__ import unicode_literals - -import pytest - - -@pytest.fixture -def open_puncts(): - return ['(', '[', '{', '*'] - - -def test_open(open_puncts, en_tokenizer): - word_str = 'Hello' - for p in open_puncts: - string = p + word_str - tokens = en_tokenizer(string) - assert len(tokens) == 2 - assert tokens[0].orth_ == p - assert tokens[1].orth_ == word_str - - -def test_two_different_open(open_puncts, en_tokenizer): - word_str = 'Hello' - for p in open_puncts: - string = p + "`" + word_str - tokens = en_tokenizer(string) - assert len(tokens) == 3 - assert tokens[0].orth_ == p - assert tokens[1].orth_ == "`" - assert tokens[2].orth_ == word_str - - -def test_three_same_open(open_puncts, en_tokenizer): - word_str = 'Hello' - for p in open_puncts: - string = p + p + p + word_str - tokens = en_tokenizer(string) - assert len(tokens) == 4 - assert tokens[0].orth_ == p - assert tokens[3].orth_ == word_str - - -def test_open_appostrophe(en_tokenizer): - string = "'The" - tokens = en_tokenizer(string) - assert len(tokens) == 2 - assert tokens[0].orth_ == "'" diff --git a/spacy/tests/tokenizer/test_special_affix.py b/spacy/tests/tokenizer/test_special_affix.py deleted file mode 100644 index 62cf114f1..000000000 --- a/spacy/tests/tokenizer/test_special_affix.py +++ /dev/null @@ -1,46 +0,0 
@@ -"""Test entries in the tokenization special-case interacting with prefix -and suffix punctuation.""" -from __future__ import unicode_literals -import pytest - - -def test_no_special(en_tokenizer): - assert len(en_tokenizer("(can)")) == 3 - - -def test_no_punct(en_tokenizer): - assert len(en_tokenizer("can't")) == 2 - - -def test_prefix(en_tokenizer): - assert len(en_tokenizer("(can't")) == 3 - - -def test_suffix(en_tokenizer): - assert len(en_tokenizer("can't)")) == 3 - - -def test_wrap(en_tokenizer): - assert len(en_tokenizer("(can't)")) == 4 - - -def test_uneven_wrap(en_tokenizer): - assert len(en_tokenizer("(can't?)")) == 5 - - -def test_prefix_interact(en_tokenizer): - assert len(en_tokenizer("U.S.")) == 1 - assert len(en_tokenizer("us.")) == 2 - assert len(en_tokenizer("(U.S.")) == 2 - - -def test_suffix_interact(en_tokenizer): - assert len(en_tokenizer("U.S.)")) == 2 - - -def test_even_wrap_interact(en_tokenizer): - assert len(en_tokenizer("(U.S.)")) == 3 - - -def test_uneven_wrap_interact(en_tokenizer): - assert len(en_tokenizer("(U.S.?)")) == 4 diff --git a/spacy/tests/tokenizer/test_string_loading.py b/spacy/tests/tokenizer/test_string_loading.py deleted file mode 100644 index 1bc5539bc..000000000 --- a/spacy/tests/tokenizer/test_string_loading.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Test suspected freeing of strings""" -from __future__ import unicode_literals - - -def test_one(en_tokenizer): - tokens = en_tokenizer('Betty Botter bought a pound of butter.') - assert tokens[0].orth_ == 'Betty' - tokens2 = en_tokenizer('Betty also bought a pound of butter.') - assert tokens2[0].orth_ == 'Betty' diff --git a/spacy/tests/tokenizer/test_surround_punct.py b/spacy/tests/tokenizer/test_surround_punct.py deleted file mode 100644 index 7c7a50904..000000000 --- a/spacy/tests/tokenizer/test_surround_punct.py +++ /dev/null @@ -1,32 +0,0 @@ -from __future__ import unicode_literals -import pytest - - -@pytest.fixture -def paired_puncts(): - return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')] - - -def test_token(paired_puncts, en_tokenizer): - word_str = 'Hello' - for open_, close_ in paired_puncts: - string = open_ + word_str + close_ - tokens = en_tokenizer(string) - assert len(tokens) == 3 - assert tokens[0].orth_ == open_ - assert tokens[1].orth_ == word_str - assert tokens[2].orth_ == close_ - - -def test_two_different(paired_puncts, en_tokenizer): - word_str = 'Hello' - for open_, close_ in paired_puncts: - string = "`" + open_ + word_str + close_ + "'" - tokens = en_tokenizer(string) - assert len(tokens) == 5 - assert tokens[0].orth_ == "`" - assert tokens[1].orth_ == open_ - assert tokens[2].orth_ == word_str - assert tokens[2].orth_ == word_str - assert tokens[3].orth_ == close_ - assert tokens[4].orth_ == "'" diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 091561ae3..9db007d7e 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -1,172 +1,83 @@ # coding: utf-8 from __future__ import unicode_literals +from os import path import pytest -import io -import pickle -import cloudpickle -import tempfile -from ... 
import util -from ...language_data import TOKENIZER_PREFIXES +from ...util import utf8open -en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search -# @pytest.mark.xfail -# def test_pickle(en_tokenizer): -# file_ = io.BytesIO() -# cloudpickle.dump(en_tokenizer, file_) -# file_.seek(0) -# loaded = pickle.load(file_) -# assert loaded is not None - -def test_pre_punct_regex(): - string = "(can't" - match = en_search_prefixes(string) - assert match.group() == "(" - -def test_no_word(en_tokenizer): - tokens = en_tokenizer(u'') +def test_tokenizer_handles_no_word(tokenizer): + tokens = tokenizer("") assert len(tokens) == 0 -def test_single_word(en_tokenizer): - tokens = en_tokenizer(u'hello') - assert tokens[0].orth_ == 'hello' +@pytest.mark.parametrize('text', ["lorem"]) +def test_tokenizer_handles_single_word(tokenizer, text): + tokens = tokenizer(text) + assert tokens[0].text == text -def test_two_words(en_tokenizer): - tokens = en_tokenizer('hello possums') - assert len(tokens) == 2 - assert tokens[0].orth_ != tokens[1].orth_ - - -def test_punct(en_tokenizer): - tokens = en_tokenizer('hello, possums.') +def test_tokenizer_handles_punct(tokenizer): + text = "Lorem, ipsum." + tokens = tokenizer(text) assert len(tokens) == 4 - assert tokens[0].orth_ == 'hello' - assert tokens[1].orth_ == ',' - assert tokens[2].orth_ == 'possums' - assert tokens[1].orth_ != 'hello' + assert tokens[0].text == "Lorem" + assert tokens[1].text == "," + assert tokens[2].text == "ipsum" + assert tokens[1].text != "Lorem" -def test_digits(en_tokenizer): - tokens = en_tokenizer('The year: 1984.') - assert len(tokens) == 5 - assert tokens[0].orth == en_tokenizer.vocab['The'].orth - assert tokens[3].orth == en_tokenizer.vocab['1984'].orth +def test_tokenizer_handles_digits(tokenizer): + exceptions = ["hu"] + text = "Lorem ipsum: 1984." + tokens = tokenizer(text) + + if tokens[0].lang_ not in exceptions: + assert len(tokens) == 5 + assert tokens[0].text == "Lorem" + assert tokens[3].text == "1984" -def test_contraction(en_tokenizer): - tokens = en_tokenizer("don't giggle") - assert len(tokens) == 3 - assert tokens[1].orth == en_tokenizer.vocab["n't"].orth - tokens = en_tokenizer("i said don't!") - assert len(tokens) == 5 - assert tokens[4].orth == en_tokenizer.vocab['!'].orth - -def test_contraction_punct(en_tokenizer): - tokens = [w.text for w in en_tokenizer("(can't")] - assert tokens == ['(', 'ca', "n't"] - tokens = en_tokenizer("`ain't") - assert len(tokens) == 3 - tokens = en_tokenizer('''"isn't''') - assert len(tokens) == 3 - tokens = en_tokenizer("can't!") - assert len(tokens) == 3 +@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"]) +def test_tokenizer_keep_urls(tokenizer, text): + tokens = tokenizer(text) + assert len(tokens) == 1 -def test_sample(en_tokenizer): - text = """Tributes pour in for late British Labour Party leader +@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"]) +def test_tokenizer_keeps_email(tokenizer, text): + tokens = tokenizer(text) + assert len(tokens) == 1 -Tributes poured in from around the world Thursday -to the late Labour Party leader John Smith, who died earlier from a massive -heart attack aged 55. -In Washington, the US State Department issued a statement regretting "the -untimely death" of the rapier-tongued Scottish barrister and parliamentarian. +def test_tokenizer_handles_long_text(tokenizer): + text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit -"Mr. 
Smith, throughout his distinguished""" +Cras egestas orci non porttitor maximus. +Maecenas quis odio id dolor rhoncus dignissim. Curabitur sed velit at orci ultrices sagittis. Nulla commodo euismod arcu eget vulputate. - tokens = en_tokenizer(text) +Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, non lacinia enim nibh eget ipsum. Vestibulum in bibendum mauris. + +"Nullam porta fringilla enim, a dictum orci consequat in." Mauris nec malesuada justo.""" + + tokens = tokenizer(text) assert len(tokens) > 5 -def test_cnts1(en_tokenizer): - text = u"""The U.S. Army likes Shock and Awe.""" - tokens = en_tokenizer(text) - assert len(tokens) == 8 +@pytest.mark.parametrize('file_name', ["sun.txt"]) +def test_tokenizer_handle_text_from_file(tokenizer, file_name): + loc = path.join(path.dirname(__file__), '..', file_name) + text = utf8open(loc).read() + assert len(text) != 0 + tokens = tokenizer(text) + assert len(tokens) > 100 -def test_cnts2(en_tokenizer): - text = u"""U.N. regulations are not a part of their concern.""" - tokens = en_tokenizer(text) - assert len(tokens) == 10 - - -def test_cnts3(en_tokenizer): - text = u"“Isn't it?”" - tokens = en_tokenizer(text) - words = [t.orth_ for t in tokens] - assert len(words) == 6 - - -def test_cnts4(en_tokenizer): - text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """ - tokens = en_tokenizer(text) - words = [t.orth_ for t in tokens] - assert len(words) == 15 - - -def test_cnts5(en_tokenizer): - text = """'Me too!', Mr. P. Delaware cried. """ - tokens = en_tokenizer(text) - assert len(tokens) == 11 - - -@pytest.mark.xfail -def test_mr(en_tokenizer): - text = """Today is Tuesday.Mr.""" - tokens = en_tokenizer(text) - assert len(tokens) == 5 - assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.'] - - -def test_cnts6(en_tokenizer): - text = u'They ran about 10km.' - tokens = en_tokenizer(text) - words = [t.orth_ for t in tokens] - assert len(words) == 6 - -def test_bracket_period(en_tokenizer): - text = u'(And a 6a.m. run through Washington Park).' - tokens = en_tokenizer(text) - assert tokens[len(tokens) - 1].orth_ == u'.' - - -def test_ie(en_tokenizer): - text = u"It's mediocre i.e. bad." - tokens = en_tokenizer(text) - assert len(tokens) == 6 - assert tokens[3].orth_ == "i.e." - - -def test_two_whitespace(en_tokenizer): - orig_str = u'there are 2 spaces after this ' - tokens = en_tokenizer(orig_str) - assert repr(tokens.text_with_ws) == repr(orig_str) - - -@pytest.mark.xfail -def test_em_dash_infix(en_tokenizer): - # Re Issue #225 - tokens = en_tokenizer('''Will this road take me to Puddleton?\u2014No, ''' - '''you'll have to walk there.\u2014Ariel.''') - assert tokens[6].text == 'Puddleton' - assert tokens[7].text == '?' - assert tokens[8].text == '\u2014' - -#def test_cnts7(): -# text = 'But then the 6,000-year ice age came...' -# tokens = EN.tokenize(text) -# assert len(tokens) == 10 +def test_tokenizer_suspected_freeing_strings(tokenizer): + text1 = "Lorem dolor sit amet, consectetur adipiscing elit." + text2 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit." 
+ tokens1 = tokenizer(text1) + tokens2 = tokenizer(text2) + assert tokens1[0].text == "Lorem" + assert tokens2[0].text == "Lorem" diff --git a/spacy/tests/tokenizer/test_whitespace.py b/spacy/tests/tokenizer/test_whitespace.py index ad34c8791..7ff3106a8 100644 --- a/spacy/tests/tokenizer/test_whitespace.py +++ b/spacy/tests/tokenizer/test_whitespace.py @@ -1,67 +1,51 @@ +# coding: utf-8 """Test that tokens are created correctly for whitespace.""" + + from __future__ import unicode_literals import pytest -def test_single_space(en_tokenizer): - tokens = en_tokenizer('hello possums') +@pytest.mark.parametrize('text', ["lorem ipsum"]) +def test_tokenizer_splits_single_space(tokenizer, text): + tokens = tokenizer(text) assert len(tokens) == 2 -def test_double_space(en_tokenizer): - tokens = en_tokenizer('hello possums') +@pytest.mark.parametrize('text', ["lorem ipsum"]) +def test_tokenizer_splits_double_space(tokenizer, text): + tokens = tokenizer(text) assert len(tokens) == 3 - assert tokens[1].orth_ == ' ' + assert tokens[1].text == " " -def test_newline(en_tokenizer): - tokens = en_tokenizer('hello\npossums') +@pytest.mark.parametrize('text', ["lorem ipsum "]) +def test_tokenizer_handles_double_trainling_ws(tokenizer, text): + tokens = tokenizer(text) + assert repr(tokens.text_with_ws) == repr(text) + + +@pytest.mark.parametrize('text', ["lorem\nipsum"]) +def test_tokenizer_splits_newline(tokenizer, text): + tokens = tokenizer(text) + assert len(tokens) == 3 + assert tokens[1].text == "\n" + + +@pytest.mark.parametrize('text', ["lorem \nipsum"]) +def test_tokenizer_splits_newline_space(tokenizer, text): + tokens = tokenizer(text) assert len(tokens) == 3 -def test_newline_space(en_tokenizer): - tokens = en_tokenizer('hello \npossums') +@pytest.mark.parametrize('text', ["lorem \nipsum"]) +def test_tokenizer_splits_newline_double_space(tokenizer, text): + tokens = tokenizer(text) assert len(tokens) == 3 -def test_newline_double_space(en_tokenizer): - tokens = en_tokenizer('hello \npossums') +@pytest.mark.parametrize('text', ["lorem \n ipsum"]) +def test_tokenizer_splits_newline_space_wrap(tokenizer, text): + tokens = tokenizer(text) assert len(tokens) == 3 - - -def test_newline_space_wrap(en_tokenizer): - tokens = en_tokenizer('hello \n possums') - assert len(tokens) == 3 - - -def test_leading_space_offsets(en_tokenizer): - '''Issue #351 - # this works - - text1 = u"This is a cat." - a = english_spacy(text1) - - tok0 = list(a.sents)[0][0] - print tok0, tok0.idx, text1[tok0.idx] - - tok1 = list(a.sents)[0][1] - print tok1, tok1.idx, text1[tok1.idx] - - print "==" - - # this does not work - - text2 = u" This is a cat." 
- b = english_spacy(text2) - - tok0 = list(b.sents)[0][0] -print tok0, tok0.idx, text2[tok0.idx] - - tok1 = list(b.sents)[0][1] - print tok1, tok1.idx, text2[tok1.idx] - ''' - doc = en_tokenizer(u" This is a cat.") - assert doc[0].idx == 0 - assert len(doc[0]) == 3 - assert doc[1].idx == 3 diff --git a/spacy/tests/tokenizer/test_wiki_sun.py b/spacy/tests/tokenizer/test_wiki_sun.py deleted file mode 100644 index 8d2a6682e..000000000 --- a/spacy/tests/tokenizer/test_wiki_sun.py +++ /dev/null @@ -1,21 +0,0 @@ -from __future__ import unicode_literals - -from spacy.util import utf8open - -import pytest -from os import path - - -HERE = path.dirname(__file__) - - -@pytest.fixture -def sun_txt(): - loc = path.join(HERE, '..', 'sun.txt') - return utf8open(loc).read() - - -def test_tokenize(sun_txt, en_tokenizer): - assert len(sun_txt) != 0 - tokens = en_tokenizer(sun_txt) - assert len(tokens) > 100 diff --git a/spacy/tests/website/conftest.py b/spacy/tests/website/conftest.py deleted file mode 100644 index 4f533ae76..000000000 --- a/spacy/tests/website/conftest.py +++ /dev/null @@ -1,20 +0,0 @@ -from __future__ import unicode_literals -import pytest -import os - - -@pytest.fixture(scope='session') -def nlp(): - from spacy.en import English - if os.environ.get('SPACY_DATA'): - data_dir = os.environ.get('SPACY_DATA') - else: - data_dir = True - return English(path=data_dir) - - -@pytest.fixture() -def doc(nlp): - for word in ['Hello', ',', 'world', '.', 'Here', 'are', 'two', 'sentences', '.']: - _ = nlp.vocab[word] - return nlp('Hello, world. Here are two sentences.') diff --git a/spacy/tests/website/test_api.py b/spacy/tests/website/test_api.py deleted file mode 100644 index 6a7379d87..000000000 --- a/spacy/tests/website/test_api.py +++ /dev/null @@ -1,172 +0,0 @@ -from __future__ import unicode_literals -import pytest -from spacy.attrs import HEAD -import numpy - - -@pytest.mark.xfail -def test_example_war_and_peace(nlp): - # from spacy.en import English - from spacy._doc_examples import download_war_and_peace - - unprocessed_unicode = download_war_and_peace() - - # nlp = English() - # TODO: ImportError: No module named _doc_examples - doc = nlp(unprocessed_unicode) - - -def test_main_entry_point(nlp): - # from spacy.en import English - # nlp = English() - doc = nlp('Some text.') # Applies tagger, parser, entity - doc = nlp('Some text.', parse=False) # Applies tagger and entity, not parser - doc = nlp('Some text.', entity=False) # Applies tagger and parser, not entity - doc = nlp('Some text.', tag=False) # Does not apply tagger, entity or parser - doc = nlp('') # Zero-length tokens, not an error - # doc = nlp(b'Some text') <-- Error: need unicode - doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. - - -@pytest.mark.models -def test_sentence_spans(nlp): - # from spacy.en import English - # nlp = English() - doc = nlp("This is a sentence. Here's another...") - assert [s.root.orth_ for s in doc.sents] == ["is", "'s"] - - -@pytest.mark.models -def test_entity_spans(nlp): - # from spacy.en import English - # nlp = English() - tokens = nlp('Mr. 
Best flew to New York on Saturday morning.') - ents = list(tokens.ents) - assert ents[0].label == 346 - assert ents[0].label_ == 'PERSON' - assert ents[0].orth_ == 'Best' - assert ents[0].string == ents[0].string - - -@pytest.mark.models -def test_noun_chunk_spans(nlp): - # from spacy.en import English - # nlp = English() - doc = nlp('The sentence in this example has three noun chunks.') - for chunk in doc.noun_chunks: - print(chunk.label, chunk.orth_, '<--', chunk.root.head.orth_) - - # NP The sentence <-- has - # NP this example <-- in - # NP three noun chunks <-- has - - -@pytest.mark.models -def test_count_by(nlp): - # from spacy.en import English, attrs - # nlp = English() - import numpy - from spacy import attrs - tokens = nlp('apple apple orange banana') - assert tokens.count_by(attrs.ORTH) == {3699: 2, 3750: 1, 5965: 1} - assert repr(tokens.to_array([attrs.ORTH])) == repr(numpy.array([[3699], - [3699], - [3750], - [5965]], dtype=numpy.int32)) - -@pytest.mark.models -def test_read_bytes(nlp): - from spacy.tokens.doc import Doc - loc = 'test_serialize.bin' - with open(loc, 'wb') as file_: - file_.write(nlp(u'This is a document.').to_bytes()) - file_.write(nlp(u'This is another.').to_bytes()) - docs = [] - with open(loc, 'rb') as file_: - for byte_string in Doc.read_bytes(file_): - docs.append(Doc(nlp.vocab).from_bytes(byte_string)) - assert len(docs) == 2 - - -def test_token_span(doc): - span = doc[4:6] - token = span[0] - assert token.i == 4 - - -@pytest.mark.models -def test_example_i_like_new_york1(nlp): - toks = nlp('I like New York in Autumn.') - - -@pytest.fixture -def toks(nlp): - doc = nlp('I like New York in Autumn.') - doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T) - return doc - - -def test_example_i_like_new_york2(toks): - i, like, new, york, in_, autumn, dot = range(len(toks)) - - -@pytest.fixture -def tok(toks, tok): - i, like, new, york, in_, autumn, dot = range(len(toks)) - return locals()[tok] - - -@pytest.fixture -def new(toks): - return tok(toks, "new") - - -@pytest.fixture -def york(toks): - return tok(toks, "york") - - -@pytest.fixture -def autumn(toks): - return tok(toks, "autumn") - - -@pytest.fixture -def dot(toks): - return tok(toks, "dot") - - -def test_example_i_like_new_york3(toks, new, york): - assert toks[new].head.orth_ == 'York' - assert toks[york].head.orth_ == 'like' - - -def test_example_i_like_new_york4(toks, new, york): - new_york = toks[new:york+1] - assert new_york.root.orth_ == 'York' - - -def test_example_i_like_new_york5(toks, autumn, dot): - assert toks[autumn].head.orth_ == 'in' - assert toks[dot].head.orth_ == 'like' - autumn_dot = toks[autumn:] - assert autumn_dot.root.orth_ == 'Autumn' - - -def test_navigating_the_parse_tree_lefts(doc): - # TODO: where does the span object come from? 
- span = doc[:2] - lefts = [span.doc[i] for i in range(0, span.start) - if span.doc[i].head in span] - - -def test_navigating_the_parse_tree_rights(doc): - span = doc[:2] - rights = [span.doc[i] for i in range(span.end, len(span.doc)) - if span.doc[i].head in span] - - -def test_string_store(doc): - string_store = doc.vocab.strings - for i, string in enumerate(string_store): - assert i == string_store[string] diff --git a/spacy/tests/website/test_home.py b/spacy/tests/website/test_home.py deleted file mode 100644 index 95c0ec3bb..000000000 --- a/spacy/tests/website/test_home.py +++ /dev/null @@ -1,180 +0,0 @@ -from __future__ import unicode_literals -import pytest -import spacy -import os - - -try: - xrange -except NameError: - xrange = range - - -@pytest.fixture() -def token(doc): - return doc[0] - - -@pytest.mark.models -def test_load_resources_and_process_text(): - from spacy.en import English - nlp = English() - doc = nlp(u'Hello, world. Here are two sentences.') - - -@pytest.mark.models -def test_get_tokens_and_sentences(doc): - token = doc[0] - sentence = next(doc.sents) - assert token is sentence[0] - assert sentence.text == 'Hello, world.' - - -@pytest.mark.models -def test_use_integer_ids_for_any_strings(nlp, token): - hello_id = nlp.vocab.strings['Hello'] - hello_str = nlp.vocab.strings[hello_id] - - assert token.orth == hello_id == 3125 - assert token.orth_ == hello_str == 'Hello' - - -def test_get_and_set_string_views_and_flags(nlp, token): - assert token.shape_ == 'Xxxxx' - for lexeme in nlp.vocab: - if lexeme.is_alpha: - lexeme.shape_ = 'W' - elif lexeme.is_digit: - lexeme.shape_ = 'D' - elif lexeme.is_punct: - lexeme.shape_ = 'P' - else: - lexeme.shape_ = 'M' - assert token.shape_ == 'W' - - -def test_export_to_numpy_arrays(nlp, doc): - from spacy.attrs import ORTH, LIKE_URL, IS_OOV - - attr_ids = [ORTH, LIKE_URL, IS_OOV] - doc_array = doc.to_array(attr_ids) - assert doc_array.shape == (len(doc), len(attr_ids)) - assert doc[0].orth == doc_array[0, 0] - assert doc[1].orth == doc_array[1, 0] - assert doc[0].like_url == doc_array[0, 1] - assert list(doc_array[:, 1]) == [t.like_url for t in doc] - - -@pytest.mark.models -def test_word_vectors(nlp): - doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") - - apples = doc[0] - oranges = doc[2] - boots = doc[6] - hippos = doc[8] - - assert apples.similarity(oranges) > boots.similarity(hippos) - - -@pytest.mark.models -def test_part_of_speech_tags(nlp): - from spacy.parts_of_speech import ADV - - def is_adverb(token): - return token.pos == spacy.parts_of_speech.ADV - - # These are data-specific, so no constants are provided. You have to look - # up the IDs from the StringStore. 
- NNS = nlp.vocab.strings['NNS'] - NNPS = nlp.vocab.strings['NNPS'] - def is_plural_noun(token): - return token.tag == NNS or token.tag == NNPS - - def print_coarse_pos(token): - print(token.pos_) - - def print_fine_pos(token): - print(token.tag_) - - -@pytest.mark.models -def test_syntactic_dependencies(): - def dependency_labels_to_root(token): - '''Walk up the syntactic tree, collecting the arc labels.''' - dep_labels = [] - while token.head is not token: - dep_labels.append(token.dep) - token = token.head - return dep_labels - - -@pytest.mark.models -def test_named_entities(): - def iter_products(docs): - for doc in docs: - for ent in doc.ents: - if ent.label_ == 'PRODUCT': - yield ent - - def word_is_in_entity(word): - return word.ent_type != 0 - - def count_parent_verb_by_person(docs): - counts = defaultdict(defaultdict(int)) - for doc in docs: - for ent in doc.ents: - if ent.label_ == 'PERSON' and ent.root.head.pos == VERB: - counts[ent.orth_][ent.root.head.lemma_] += 1 - return counts - - -def test_calculate_inline_mark_up_on_original_string(): - def put_spans_around_tokens(doc, get_classes): - '''Given some function to compute class names, put each token in a - span element, with the appropriate classes computed. - - All whitespace is preserved, outside of the spans. (Yes, I know HTML - won't display it. But the point is no information is lost, so you can - calculate what you need, e.g.
<br /> tags, <p>

tags, etc.) - ''' - output = [] - template = '{word}{space}' - for token in doc: - if token.is_space: - output.append(token.orth_) - else: - output.append( - template.format( - classes=' '.join(get_classes(token)), - word=token.orth_, - space=token.whitespace_)) - string = ''.join(output) - string = string.replace('\n', '') - string = string.replace('\t', ' ') - return string - - -@pytest.mark.models -def test_efficient_binary_serialization(doc): - from spacy.tokens.doc import Doc - - byte_string = doc.to_bytes() - open('moby_dick.bin', 'wb').write(byte_string) - - nlp = spacy.en.English() - for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')): - doc = Doc(nlp.vocab) - doc.from_bytes(byte_string) - - -@pytest.mark.models -def test_multithreading(nlp): - texts = [u'One document.', u'...', u'Lots of documents'] - # .pipe streams input, and produces streaming output - iter_texts = (texts[i % 3] for i in xrange(100000000)) - for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)): - assert doc.is_parsed - if i == 100: - break - diff --git a/spacy/util.py b/spacy/util.py index afed4142e..457534302 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -94,8 +94,13 @@ def read_regex(path): def compile_prefix_regex(entries): - expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) - return re.compile(expression) + if '(' in entries: + # Handle deprecated data + expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) + return re.compile(expression) + else: + expression = '|'.join(['^' + piece for piece in entries if piece.strip()]) + return re.compile(expression) def compile_suffix_regex(entries): diff --git a/website/_harp.json b/website/_harp.json index bc8cf4d84..04a66f772 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -22,7 +22,8 @@ "twitter": "spacy_io", "github": "explosion", "reddit": "spacynlp", - "codepen": "explosion" + "codepen": "explosion", + "gitter": "explosion/spaCy" }, "NAVIGATION": { @@ -53,7 +54,7 @@ } }, - "V_CSS": "1.10", + "V_CSS": "1.14", "V_JS": "1.0", "DEFAULT_SYNTAX" : "python", "ANALYTICS": "UA-58931649-1", diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade index 27f195690..bc8b85557 100644 --- a/website/_includes/_mixins-base.jade +++ b/website/_includes/_mixins-base.jade @@ -1,6 +1,7 @@ //- 💫 MIXINS > BASE //- Aside wrapper + label - [string] aside label mixin aside-wrapper(label) aside.c-aside @@ -21,6 +22,10 @@ mixin date(input) //- SVG from map + file - [string] SVG file name in /assets/img/ + name - [string] SVG symbol id + width - [integer] width in px + height - [integer] height in px (default: same as width) mixin svg(file, name, width, height) svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes) @@ -28,19 +33,23 @@ mixin svg(file, name, width, height) //- Icon + name - [string] icon name, should be SVG symbol ID + size - [integer] icon width and height (default: 20) mixin icon(name, size) - +svg("icons", "icon-" + name, size || 20).o-icon&attributes(attributes) + +svg("icons", name, size || 20).o-icon&attributes(attributes) //- Pro/Con/Neutral icon + icon - [string] "pro", "con" or "neutral" (default: "neutral") mixin procon(icon) - - colors = { pro: "green", con: "red" } + - colors = { pro: "green", con: "red", neutral: "yellow" } +icon(icon)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes) //- Headlines Helper Mixin + 
level - [integer] 1, 2, 3, 4, or 5 mixin headline(level) if level == 1 @@ -65,6 +74,7 @@ mixin headline(level) //- Permalink rendering + id - [string] permalink ID used for link anchor mixin permalink(id) if id @@ -77,6 +87,7 @@ mixin permalink(id) //- Terminal-style code window + label - [string] title displayed in top bar of terminal window mixin terminal(label) .x-terminal @@ -87,6 +98,18 @@ mixin terminal(label) block +//- Gitter chat button and widget + button - [string] text shown on button + label - [string] title of chat window (default: same as button) + +mixin gitter(button, label) + aside.js-gitter.c-chat.is-collapsed(data-title=(label || button)) + + button.js-gitter-button.c-chat__button.u-text-small + +icon("chat").o-icon--inline + !=button + + //- Logo mixin logo() diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 8fe24b11b..8a42024c1 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -44,7 +44,7 @@ mixin api(path) +a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block block - | #[+icon("book", 18).o-icon--inline.u-help.u-color-subtle] + | #[+icon("book", 18).o-icon--inline.u-color-subtle] //- Aside for text diff --git a/website/_includes/_page-docs.jade b/website/_includes/_page-docs.jade index 09cbfa6a5..72db134cd 100644 --- a/website/_includes/_page-docs.jade +++ b/website/_includes/_page-docs.jade @@ -24,4 +24,6 @@ main.o-main.o-main--sidebar.o-main--aside .o-inline-list +button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary").u-text-tag Suggest edits #[+icon("code", 14)] + +gitter("spaCy chat") + include _footer diff --git a/website/_includes/_scripts.jade b/website/_includes/_scripts.jade new file mode 100644 index 000000000..544cf0977 --- /dev/null +++ b/website/_includes/_scripts.jade @@ -0,0 +1,23 @@ +//- 💫 INCLUDES > SCRIPTS + +script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript") +script(src="/assets/js/prism.js", type="text/javascript") + +if SECTION == "docs" + script. 
+ ((window.gitter = {}).chat = {}).options = { + useStyles: false, + activationElement: '.js-gitter-button', + targetElement: '.js-gitter', + room: '!{SOCIAL.gitter}' + }; + + script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer) + +if environment == "deploy" + script + | window.ga=window.ga||function(){ + | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date; + | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview'); + + script(async src="https://www.google-analytics.com/analytics.js") diff --git a/website/_layout.jade b/website/_layout.jade index b04c4b5f3..d5c52df3f 100644 --- a/website/_layout.jade +++ b/website/_layout.jade @@ -52,13 +52,4 @@ html(lang="en") main!=yield include _includes/_footer - script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript") - script(src="/assets/js/prism.js", type="text/javascript") - - if environment == "deploy" - script - | window.ga=window.ga||function(){ - | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date; - | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview'); - - script(async src="https://www.google-analytics.com/analytics.js") + include _includes/_scripts diff --git a/website/assets/css/_base/_fonts.sass b/website/assets/css/_base/_fonts.sass index 72aaf97f8..be113798c 100644 --- a/website/assets/css/_base/_fonts.sass +++ b/website/assets/css/_base/_fonts.sass @@ -6,36 +6,36 @@ font-family: "Source Sans Pro" font-style: normal font-weight: 400 - src: url("../fonts/sourcesanspro-regular.eot") - src: url("../fonts/sourcesanspro-regular.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-regular.woff2") format("woff2"), url("../fonts/sourcesanspro-regular.woff") format("woff"), url("../fonts/sourcesanspro-regular.ttf") format("truetype"), url("../fonts/sourcesanspro-regular.svg#source_sans_proregular") format("svg") + src: url("/assets/fonts/sourcesanspro-regular.eot") + src: url("/assets/fonts/sourcesanspro-regular.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-regular.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-regular.woff") format("woff"), url("/assets/fonts/sourcesanspro-regular.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-regular.svg#source_sans_proregular") format("svg") @font-face font-family: "Source Sans Pro" font-style: italic font-weight: 400 - src: url("../fonts/sourcesanspro-italic.eot") - src: url("../fonts/sourcesanspro-italic.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-italic.woff2") format("woff2"), url("../fonts/sourcesanspro-italic.woff") format("woff"), url("../fonts/sourcesanspro-italic.ttf") format("truetype"), url("../fonts/sourcesanspro-italic.svg#source_sans_proitalic") format("svg") + src: url("/assets/fonts/sourcesanspro-italic.eot") + src: url("/assets/fonts/sourcesanspro-italic.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-italic.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-italic.woff") format("woff"), url("/assets/fonts/sourcesanspro-italic.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-italic.svg#source_sans_proitalic") format("svg") @font-face font-family: "Source Sans Pro" font-style: normal font-weight: 700 - src: url("../fonts/sourcesanspro-bold.eot") - src: url("../fonts/sourcesanspro-bold.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-bold.woff2") format("woff2"), url("../fonts/sourcesanspro-bold.woff") format("woff"), url("../fonts/sourcesanspro-bold.ttf") format("truetype"), 
url("../fonts/sourcesanspro-bold.svg#source_sans_probold") format("svg") + src: url("/assets/fonts/sourcesanspro-bold.eot") + src: url("/assets/fonts/sourcesanspro-bold.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-bold.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-bold.woff") format("woff"), url("/assets/fonts/sourcesanspro-bold.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-bold.svg#source_sans_probold") format("svg") @font-face font-family: "Source Sans Pro" font-style: italic font-weight: 700 - src: url("../fonts/sourcesanspro-bolditalic.eot") - src: url("../fonts/sourcesanspro-bolditalic.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-bolditalic.woff2") format("woff2"), url("../fonts/sourcesanspro-bolditalic.woff") format("woff"), url("../fonts/sourcesanspro-bolditalic.ttf") format("truetype"), url("../fonts/sourcesanspro-bolditalic.svg#source_sans_probold_italic") format("svg") + src: url("/assets/fonts/sourcesanspro-bolditalic.eot") + src: url("/assets/fonts/sourcesanspro-bolditalic.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-bolditalic.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-bolditalic.woff") format("woff"), url("/assets/fonts/sourcesanspro-bolditalic.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-bolditalic.svg#source_sans_probold_italic") format("svg") // Source Code Pro @font-face - font-family: "Source Code Pro" - font-style: normal - font-weight: 600 - src: url("../fonts/sourcecodepro-semibold.eot") - src: url("../fonts/sourcecodepro-semibold.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcecodepro-semibold.woff") format("woff"), url("../fonts/sourcecodepro-semibold.ttf") format("truetype"), url("../fonts/sourcecodepro-semibold.svg#sourcecodepro_semibold") format("svg") + font-family: "Source Code Pro" + font-style: normal + font-weight: 600 + src: url("/assets/fonts/sourcecodepro-semibold.eot") + src: url("/assets/fonts/sourcecodepro-semibold.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcecodepro-semibold.woff") format("woff"), url("/assets/fonts/sourcecodepro-semibold.ttf") format("truetype"), url("/assets/fonts/sourcecodepro-semibold.svg#sourcecodepro_semibold") format("svg") diff --git a/website/assets/css/_base/_objects.sass b/website/assets/css/_base/_objects.sass index 2b037dca7..7aaaef787 100644 --- a/website/assets/css/_base/_objects.sass +++ b/website/assets/css/_base/_objects.sass @@ -60,7 +60,7 @@ background: $color-back border-radius: 2px border: 1px solid $color-subtle - padding: 3.5% 2.5% + padding: 3rem 2.5% //- Icons diff --git a/website/assets/css/_base/_utilities.sass b/website/assets/css/_base/_utilities.sass index 95be81bcd..2c40858a8 100644 --- a/website/assets/css/_base/_utilities.sass +++ b/website/assets/css/_base/_utilities.sass @@ -141,12 +141,6 @@ background: $pattern -//- Cursors - -.u-help - cursor: help - - //- Hidden elements .u-hidden diff --git a/website/assets/css/_components/_chat.sass b/website/assets/css/_components/_chat.sass new file mode 100644 index 000000000..2a1e5cc3d --- /dev/null +++ b/website/assets/css/_components/_chat.sass @@ -0,0 +1,100 @@ +//- 💫 CSS > COMPONENTS > CHAT + +.c-chat + @include position(fixed, top, left, 0, 60%) + bottom: 0 + right: 0 + display: flex + flex-flow: column nowrap + background: $color-back + transition: transform 0.3s cubic-bezier(0.16, 0.22, 0.22, 1.7) + box-shadow: -0.25rem 0 1rem 0 rgba($color-front, 0.25) + z-index: 100 + + 
@include breakpoint(min, md) + left: calc(100% - #{$aside-width} - #{$aside-padding}) + + @include breakpoint(max, sm) + left: 50% + + @include breakpoint(max, xs) + left: 0 + + &.is-collapsed:not(.is-loading) + transform: translateX(110%) + + &:before + @include position(absolute, top, left, 1rem, 2rem) + content: attr(data-title) + font: bold 1.4rem $font-code + text-transform: uppercase + color: $color-back + + &:after + @include position(absolute, top, left, 0, 100%) + content: "" + z-index: -1 + bottom: 0 + right: -100% + background: $color-back + + & > iframe + width: 100% + flex: 1 1 calc(100% - #{$nav-height}) + border: 0 + + .gitter-chat-embed-loading-wrapper + @include position(absolute, top, left, 0, 0) + right: 0 + bottom: 0 + display: none + justify-content: center + align-items: center + + .is-loading & + display: flex + + .gitter-chat-embed-action-bar, + .gitter-chat-embed-action-bar-item + display: flex + + .gitter-chat-embed-action-bar + align-items: center + justify-content: flex-end + background: $color-theme + padding: 0 1rem 0 2rem + flex: 0 0 $nav-height + + .gitter-chat-embed-action-bar-item + @include size(40px) + padding: 0 + opacity: 0.75 + background-position: 50% + background-repeat: no-repeat + background-size: 22px 22px + border: 0 + cursor: pointer + transition: all 0.2s ease + + &:focus, + &:hover + opacity: 1 + + &.gitter-chat-embed-action-bar-item-pop-out + background-image: url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyMCIgaGVpZ2h0PSIyMCIgdmlld0JveD0iMCAwIDIwIDIwIj48cGF0aCBmaWxsPSIjZmZmIiBkPSJNMTYgMmgtOC4wMjFjLTEuMDk5IDAtMS45NzkgMC44OC0xLjk3OSAxLjk4djguMDIwYzAgMS4xIDAuOSAyIDIgMmg4YzEuMSAwIDItMC45IDItMnYtOGMwLTEuMS0wLjktMi0yLTJ6TTE2IDEyaC04di04aDh2OHpNNCAxMGgtMnY2YzAgMS4xIDAuOSAyIDIgMmg2di0yaC02di02eiI+PC9wYXRoPjwvc3ZnPg==) + margin-right: -4px + + &.gitter-chat-embed-action-bar-item-collapse-chat + background-image: url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNCIgaGVpZ2h0PSIyNCIgdmlld0JveD0iMCAwIDI0IDI0Ij48cGF0aCBmaWxsPSIjZmZmIiBkPSJNMTguOTg0IDYuNDIybC01LjU3OCA1LjU3OCA1LjU3OCA1LjU3OC0xLjQwNiAxLjQwNi01LjU3OC01LjU3OC01LjU3OCA1LjU3OC0xLjQwNi0xLjQwNiA1LjU3OC01LjU3OC01LjU3OC01LjU3OCAxLjQwNi0xLjQwNiA1LjU3OCA1LjU3OCA1LjU3OC01LjU3OHoiPjwvcGF0aD48L3N2Zz4=) + +.c-chat__button + @include position(fixed, bottom, right, 0, 2rem) + padding: 1rem 1.5rem + background: $color-front + color: $color-back + border-top-left-radius: 4px + border-top-right-radius: 4px + z-index: 20 + border-color: $color-theme + border-style: solid + border-width: 1px 1px 0 1px diff --git a/website/assets/css/style.sass b/website/assets/css/style.sass index 5ab135ab9..a8d2edad4 100644 --- a/website/assets/css/style.sass +++ b/website/assets/css/style.sass @@ -24,6 +24,7 @@ $theme: blue !default @import _components/asides @import _components/buttons +@import _components/chat @import _components/code @import _components/landing @import _components/lists diff --git a/website/assets/img/graphics.svg b/website/assets/img/graphics.svg index 23036f4ca..dc69deda4 100644 --- a/website/assets/img/graphics.svg +++ b/website/assets/img/graphics.svg @@ -64,5 +64,6 @@ matt-signature + diff --git a/website/assets/img/icons.svg b/website/assets/img/icons.svg index 9237c9994..224224084 100644 --- a/website/assets/img/icons.svg +++ b/website/assets/img/icons.svg @@ -1,32 +1,28 @@ - - github - + + - - code - + + - - anchor - + + - - book - + + - - pro - + + - - con - + + - - neutral - + + + + + diff --git 
a/website/docs/api/index.jade b/website/docs/api/index.jade index 20995df2e..24f3d4458 100644 --- a/website/docs/api/index.jade +++ b/website/docs/api/index.jade @@ -23,7 +23,7 @@ p +row +cell Multi-language support - each icon in [ "con", "pro", "pro", "pro" ] + each icon in [ "neutral", "pro", "pro", "pro" ] +cell.u-text-center #[+procon(icon)] +row diff --git a/website/docs/index.jade b/website/docs/index.jade index d2949b8c4..c19602002 100644 --- a/website/docs/index.jade +++ b/website/docs/index.jade @@ -2,8 +2,6 @@ include ../_includes/_mixins -p=lorem_short - +aside("Help us improve the docs") | Did you spot a mistake or come across explanations that | are unclear? You can find a "Suggest edits" button at the diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade index 4b62a290b..a96df5694 100644 --- a/website/docs/usage/entity-recognition.jade +++ b/website/docs/usage/entity-recognition.jade @@ -57,7 +57,7 @@ p doc.ents = [Span(0, 1, label='GPE')] assert doc[0].ent_type_ == 'GPE' doc.ents = [] - doc.ents = [(u'LondonCity', 0, 1, u'GPE')] + doc.ents = [(u'LondonCity', u'GPE', 0, 1)] p | The value you assign should be a sequence, the values of which diff --git a/website/docs/usage/resources.jade b/website/docs/usage/resources.jade index a09c7358d..2b80ebe48 100644 --- a/website/docs/usage/resources.jade +++ b/website/docs/usage/resources.jade @@ -30,6 +30,13 @@ p Many of the associated tools and resources that we're developing alongside spa +cell | REST microservices for spaCy demos and visualisers. + +row + +cell + +src(gh("spacy-notebooks")) spaCy Notebooks + + +cell + | Jupyter notebooks for spaCy examples and tutorials. + +h(2, "libraries") Libraries and projects +table(["Name", "Description"]) +row diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index bedadb0d3..fde9ee4d7 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -141,7 +141,7 @@ p span.merge(label=label, tag='NNP' if label else span.root.tag_) matcher.add_entity('GoogleNow', on_match=merge_phrases) - matcher.add_pattern('GoogleNow', {ORTH: 'Google'}, {ORTH: 'Now'}]) + matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}]) doc = Doc(matcher.vocab, words=[u'Google', u'Now', u'is', u'being', u'rebranded']) matcher(doc) print([w.text for w in doc])
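The final hunk above fixes a missing opening bracket in the add_pattern call of the rule-based matching docs. As a minimal sketch of how the corrected call fits together end to end: only the add_entity, add_pattern, Doc(...) and matcher(doc) calls appear in the hunk itself; the spacy.load('en') and Matcher(nlp.vocab) setup below is an assumption about the surrounding pre-2.0 API rather than part of the patch, and the merge_phrases callback from the docs example is omitted.

    import spacy
    from spacy.attrs import ORTH
    from spacy.matcher import Matcher
    from spacy.tokens import Doc

    # Assumed setup, not shown in the hunk: load a vocab and build a Matcher on it.
    nlp = spacy.load('en')
    matcher = Matcher(nlp.vocab)

    # Register the entity key, then attach the token pattern. Note the list
    # brackets around the token specs, which is exactly what the hunk corrects.
    matcher.add_entity('GoogleNow')
    matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])

    doc = Doc(nlp.vocab, words=[u'Google', u'Now', u'is', u'being', u'rebranded'])
    matches = matcher(doc)  # expected to yield (ent_id, label, start, end) tuples in the pre-2.0 API
    print(matches)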