Merge pull request #692 from explosion/organize-language-data

Reorganize language data
Ines Montani 2016-12-18 17:38:42 +01:00 committed by GitHub
commit 5dfeefcb25
114 changed files with 4603 additions and 3047813 deletions

View File

@ -1,229 +0,0 @@
"""Set up a model directory.
Requires:
lang_data --- Rules for the tokenizer
* prefix.txt
* suffix.txt
* infix.txt
* morphs.json
* specials.json
corpora --- Data files
* WordNet
* words.sgt.prob --- Smoothed unigram probabilities
* clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
* vectors.bz2 --- Output of something like word2vec, compressed with bzip2
"""
from __future__ import unicode_literals
from ast import literal_eval
import math
import gzip
import json
import plac
from pathlib import Path
from shutil import copyfile
from shutil import copytree
from collections import defaultdict
import io
from spacy.vocab import Vocab
from spacy.vocab import write_binary_vectors
from spacy.strings import hash_string
from preshed.counter import PreshCounter
from spacy.parts_of_speech import NOUN, VERB, ADJ
from spacy.util import get_lang_class
try:
unicode
except NameError:
unicode = str
def setup_tokenizer(lang_data_dir, tok_dir):
if not tok_dir.exists():
tok_dir.mkdir()
for filename in ('infix.txt', 'morphs.json', 'prefix.txt', 'specials.json',
'suffix.txt'):
src = lang_data_dir / filename
dst = tok_dir / filename
copyfile(str(src), str(dst))
def _read_clusters(loc):
if not loc.exists():
print("Warning: Clusters file not found")
return {}
clusters = {}
for line in io.open(str(loc), 'r', encoding='utf8'):
try:
cluster, word, freq = line.split()
except ValueError:
continue
# If the clusterer has only seen the word a few times, its cluster is
# unreliable.
if int(freq) >= 3:
clusters[word] = cluster
else:
clusters[word] = '0'
# Expand clusters with re-casing
for word, cluster in list(clusters.items()):
if word.lower() not in clusters:
clusters[word.lower()] = cluster
if word.title() not in clusters:
clusters[word.title()] = cluster
if word.upper() not in clusters:
clusters[word.upper()] = cluster
return clusters
def _read_probs(loc):
if not loc.exists():
print("Probabilities file not found. Trying freqs.")
return {}, 0.0
probs = {}
for i, line in enumerate(io.open(str(loc), 'r', encoding='utf8')):
prob, word = line.split()
prob = float(prob)
probs[word] = prob
return probs, probs['-OOV-']
def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
if not loc.exists():
print("Warning: Frequencies file not found")
return {}, 0.0
counts = PreshCounter()
total = 0
if str(loc).endswith('gz'):
file_ = gzip.open(str(loc))
else:
file_ = loc.open()
for i, line in enumerate(file_):
freq, doc_freq, key = line.rstrip().split('\t', 2)
freq = int(freq)
counts.inc(i+1, freq)
total += freq
counts.smooth()
log_total = math.log(total)
if str(loc).endswith('gz'):
file_ = gzip.open(str(loc))
else:
file_ = loc.open()
probs = {}
for line in file_:
freq, doc_freq, key = line.rstrip().split('\t', 2)
doc_freq = int(doc_freq)
freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
word = literal_eval(key)
smooth_count = counts.smoother(int(freq))
probs[word] = math.log(smooth_count) - log_total
oov_prob = math.log(counts.smoother(0)) - log_total
return probs, oov_prob
def _read_senses(loc):
lexicon = defaultdict(lambda: defaultdict(list))
if not loc.exists():
print("Warning: WordNet senses not found")
return lexicon
sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
for line in codecs.open(str(loc), 'r', 'utf8'):
sense_strings = line.split()
word = sense_strings.pop(0)
for sense in sense_strings:
pos, sense = sense[3:].split('.')
sense_name = '%s_%s' % (pos[0].upper(), sense.lower())
if sense_name != 'N_tops':
sense_id = sense_names[sense_name]
lexicon[word][pos_ids[pos]].append(sense_id)
return lexicon
def setup_vocab(lex_attr_getters, tag_map, src_dir, dst_dir):
if not dst_dir.exists():
dst_dir.mkdir()
print('Reading vocab from ', src_dir)
vectors_src = src_dir / 'vectors.bz2'
if vectors_src.exists():
write_binary_vectors(vectors_src.as_posix(), (dst_dir / 'vec.bin').as_posix())
else:
print("Warning: Word vectors file not found")
vocab = Vocab(lex_attr_getters=lex_attr_getters, tag_map=tag_map)
clusters = _read_clusters(src_dir / 'clusters.txt')
probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
if not probs:
probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz')
if not probs:
oov_prob = -20
else:
oov_prob = min(probs.values())
for word in clusters:
if word not in probs:
probs[word] = oov_prob
lexicon = []
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
# First encode the strings into the StringStore. This way, we can map
# the orth IDs to frequency ranks
orth = vocab.strings[word]
# Now actually load the vocab
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
lexeme = vocab[word]
lexeme.prob = prob
lexeme.is_oov = False
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
if word in clusters:
lexeme.cluster = int(clusters[word][::-1], 2)
else:
lexeme.cluster = 0
vocab.dump((dst_dir / 'lexemes.bin').as_posix())
with (dst_dir / 'strings.json').open('w') as file_:
vocab.strings.dump(file_)
with (dst_dir / 'oov_prob').open('w') as file_:
file_.write('%f' % oov_prob)
def main(lang_id, lang_data_dir, corpora_dir, model_dir):
model_dir = Path(model_dir)
lang_data_dir = Path(lang_data_dir) / lang_id
corpora_dir = Path(corpora_dir) / lang_id
assert corpora_dir.exists()
assert lang_data_dir.exists()
if not model_dir.exists():
model_dir.mkdir()
tag_map = json.load((lang_data_dir / 'tag_map.json').open())
setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
setup_vocab(get_lang_class(lang_id).Defaults.lex_attr_getters, tag_map, corpora_dir,
model_dir / 'vocab')
if (lang_data_dir / 'gazetteer.json').exists():
copyfile((lang_data_dir / 'gazetteer.json').as_posix(),
(model_dir / 'vocab' / 'gazetteer.json').as_posix())
copyfile((lang_data_dir / 'tag_map.json').as_posix(),
(model_dir / 'vocab' / 'tag_map.json').as_posix())
if (lang_data_dir / 'lemma_rules.json').exists():
copyfile((lang_data_dir / 'lemma_rules.json').as_posix(),
(model_dir / 'vocab' / 'lemma_rules.json').as_posix())
if not (model_dir / 'wordnet').exists() and (corpora_dir / 'wordnet').exists():
copytree((corpora_dir / 'wordnet' / 'dict').as_posix(),
(model_dir / 'wordnet').as_posix())
if __name__ == '__main__':
plac.call(main)
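
The "little-endian" comment in setup_vocab is easy to miss: reversing the Brown cluster bit string before parsing it as binary puts the first (coarsest) bits of the cluster path into the lowest bits of the integer, so a cheap "& 15" recovers the four-bit prefix that the feature code in _parse_features.pyx wants. A minimal standalone sketch (the function names are mine, not spaCy's):

# Minimal sketch of the little-endian cluster encoding used above.
# A Brown cluster path is a bit string such as "1011010"; its first bits
# carry the coarsest distinctions, so those are the bits the features
# want cheap access to.

def encode_cluster(bits):
    # Reverse the string so bits[0] becomes the least significant bit.
    return int(bits[::-1], 2) if bits else 0

def cluster_prefix(code, n_bits=4):
    # Because of the reversal, masking the low bits recovers the first
    # n_bits of the original cluster string.
    return code & ((1 << n_bits) - 1)

code = encode_cluster("1011010")    # -> 45
low = cluster_prefix(code)          # -> 13 == 0b1101
assert format(low, "04b")[::-1] == "1011010"[:4]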

fabfile.py (vendored, 163 changed lines)
View File

@ -13,134 +13,6 @@ PWD = path.dirname(__file__)
VENV_DIR = path.join(PWD, '.env')
def counts():
pass
# Tokenize the corpus
# tokenize()
# get_freqs()
# Collate the counts
# cat freqs | sort -k2 | gather_freqs()
# gather_freqs()
# smooth()
# clean, make, sdist
# cd to new env, install from sdist,
# Push changes to server
# Pull changes on server
# clean make init model
# test --vectors --slow
# train
# test --vectors --slow --models
# sdist
# upload data to server
# change to clean venv
# py2: install from sdist, test --slow, download data, test --models --vectors
# py3: install from sdist, test --slow, download data, test --models --vectors
def prebuild(build_dir='/tmp/build_spacy'):
if file_exists(build_dir):
shutil.rmtree(build_dir)
os.mkdir(build_dir)
spacy_dir = path.dirname(__file__)
wn_url = 'http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz'
build_venv = path.join(build_dir, '.env')
with lcd(build_dir):
local('git clone %s .' % spacy_dir)
local('virtualenv ' + build_venv)
with prefix('cd %s && PYTHONPATH=`pwd` && . %s/bin/activate' % (build_dir, build_venv)):
local('pip install cython fabric fabtools pytest')
local('pip install --no-cache-dir -r requirements.txt')
local('fab clean make')
local('cp -r %s/corpora/en/wordnet corpora/en/' % spacy_dir)
local('PYTHONPATH=`pwd` python bin/init_model.py en lang_data corpora spacy/en/data')
local('PYTHONPATH=`pwd` fab test')
local('PYTHONPATH=`pwd` python -m spacy.en.download --force all')
local('PYTHONPATH=`pwd` py.test --models spacy/tests/')
def web():
def jade(source_name, out_dir):
pwd = path.join(path.dirname(__file__), 'website')
jade_loc = path.join(pwd, 'src', 'jade', source_name)
out_loc = path.join(pwd, 'site', out_dir)
local('jade -P %s --out %s' % (jade_loc, out_loc))
with virtualenv(VENV_DIR):
local('./website/create_code_samples spacy/tests/website/ website/src/code/')
jade('404.jade', '')
jade('home/index.jade', '')
jade('docs/index.jade', 'docs/')
jade('blog/index.jade', 'blog/')
for collection in ('blog', 'tutorials'):
for post_dir in (Path(__file__).parent / 'website' / 'src' / 'jade' / collection).iterdir():
if post_dir.is_dir() \
and (post_dir / 'index.jade').exists() \
and (post_dir / 'meta.jade').exists():
jade(str(post_dir / 'index.jade'), path.join(collection, post_dir.parts[-1]))
def web_publish(assets_path):
from boto.s3.connection import S3Connection, OrdinaryCallingFormat
site_path = 'website/site'
os.environ['S3_USE_SIGV4'] = 'True'
conn = S3Connection(host='s3.eu-central-1.amazonaws.com',
calling_format=OrdinaryCallingFormat())
bucket = conn.get_bucket('spacy.io', validate=False)
keys_left = set([k.name for k in bucket.list()
if not k.name.startswith('resources')])
for root, dirnames, filenames in os.walk(site_path):
for dirname in dirnames:
target = os.path.relpath(os.path.join(root, dirname), site_path)
source = os.path.join(target, 'index.html')
if os.path.exists(os.path.join(root, dirname, 'index.html')):
key = bucket.new_key(source)
key.set_redirect('//%s/%s' % (bucket.name, target))
print('adding redirect for %s' % target)
keys_left.remove(source)
for filename in filenames:
source = os.path.join(root, filename)
target = os.path.relpath(root, site_path)
if target == '.':
target = filename
elif filename != 'index.html':
target = os.path.join(target, filename)
key = bucket.new_key(target)
key.set_metadata('Content-Type', 'text/html')
key.set_contents_from_filename(source)
print('uploading %s' % target)
keys_left.remove(target)
for key_name in keys_left:
print('deleting %s' % key_name)
bucket.delete_key(key_name)
local('aws s3 sync --delete %s s3://spacy.io/resources' % assets_path)
def publish(version):
with virtualenv(VENV_DIR):
local('git push origin master')
local('git tag -a %s' % version)
local('git push origin %s' % version)
local('python setup.py sdist')
local('python setup.py register')
local('twine upload dist/spacy-%s.tar.gz' % version)
def env(lang="python2.7"):
if file_exists('.env'):
local('rm -rf .env')
@ -172,38 +44,3 @@ def test():
with virtualenv(VENV_DIR):
with lcd(path.dirname(__file__)):
local('py.test -x spacy/tests')
def train(json_dir=None, dev_loc=None, model_dir=None):
if json_dir is None:
json_dir = 'corpora/en/json'
if model_dir is None:
model_dir = 'models/en/'
with virtualenv(VENV_DIR):
with lcd(path.dirname(__file__)):
local('python bin/init_model.py en lang_data/ corpora/ ' + model_dir)
local('python bin/parser/train.py -p en %s/train/ %s/development %s' % (json_dir, json_dir, model_dir))
def travis():
local('open https://travis-ci.org/honnibal/thinc')
def pos():
with virtualenv(VENV_DIR):
local('python tools/train.py ~/work_data/docparse/wsj02-21.conll ~/work_data/docparse/wsj22.conll spacy/en/data')
local('python tools/tag.py ~/work_data/docparse/wsj22.raw /tmp/tmp')
local('python tools/eval_pos.py ~/work_data/docparse/wsj22.conll /tmp/tmp')
def ner():
local('rm -rf data/en/ner')
local('python tools/train_ner.py ~/work_data/docparse/wsj02-21.conll data/en/ner')
local('python tools/tag_ner.py ~/work_data/docparse/wsj22.raw /tmp/tmp')
local('python tools/eval_ner.py ~/work_data/docparse/wsj22.conll /tmp/tmp | tail')
def conll():
local('rm -rf data/en/ner')
local('python tools/conll03_train.py ~/work_data/ner/conll2003/eng.train data/en/ner/')
local('python tools/conll03_eval.py ~/work_data/ner/conll2003/eng.testa')
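
The commented-out counts() workflow near the top of this fabfile is what produces the corpora/<lang>/freqs.txt.gz file that _read_freqs() in bin/init_model.py consumes: one tab-separated line per word holding the total frequency, the document frequency, and the word itself as a Python string literal (recovered there with literal_eval). A rough sketch of a writer that matches that reader; all names here are hypothetical:

# Hypothetical writer for the freqs.txt.gz format read by _read_freqs()
# in bin/init_model.py: "<freq>\t<doc_freq>\t<repr(word)>" per line.
import gzip
from collections import Counter

def write_freqs(docs, loc="freqs.txt.gz"):
    freqs = Counter()       # total occurrences of each word
    doc_freqs = Counter()   # number of documents containing each word
    for doc in docs:
        tokens = doc.split()
        freqs.update(tokens)
        doc_freqs.update(set(tokens))
    with gzip.open(loc, "wt", encoding="utf8") as file_:
        for word, freq in freqs.most_common():
            file_.write("%d\t%d\t%r\n" % (freq, doc_freqs[word], word))

write_freqs(["the cat sat on the mat", "the dog sat down"])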

View File

@ -1,319 +0,0 @@
# surface form lemma pos
# multiple values are separated by |
# empty lines and lines starting with # are being ignored
'' ''
\") \")
\n \n <nl> SP
\t \t <tab> SP
<space> SP
# example: Wie geht's?
's 's es
'S 'S es
# example: Haste mal 'nen Euro?
'n 'n ein
'ne 'ne eine
'nen 'nen einen
# example: Kommen S nur herein!
s' s' sie
S' S' sie
# example: Da haben wir's!
ich's ich|'s ich|es
du's du|'s du|es
er's er|'s er|es
sie's sie|'s sie|es
wir's wir|'s wir|es
ihr's ihr|'s ihr|es
# example: Die katze auf'm dach.
auf'm auf|'m auf|dem
unter'm unter|'m unter|dem
über'm über|'m über|dem
vor'm vor|'m vor|dem
hinter'm hinter|'m hinter|dem
# persons
B.A. B.A.
B.Sc. B.Sc.
Dipl. Dipl.
Dipl.-Ing. Dipl.-Ing.
Dr. Dr.
Fr. Fr.
Frl. Frl.
Hr. Hr.
Hrn. Hrn.
Frl. Frl.
Prof. Prof.
St. St.
Hrgs. Hrgs.
Hg. Hg.
a.Z. a.Z.
a.D. a.D.
h.c. h.c.
Jr. Jr.
jr. jr.
jun. jun.
sen. sen.
rer. rer.
Ing. Ing.
M.A. M.A.
Mr. Mr.
M.Sc. M.Sc.
nat. nat.
phil. phil.
# companies
Co. Co.
co. co.
Cie. Cie.
A.G. A.G.
G.m.b.H. G.m.b.H.
i.G. i.G.
e.V. e.V.
# popular german abbreviations
Abb. Abb.
Abk. Abk.
Abs. Abs.
Abt. Abt.
abzgl. abzgl.
allg. allg.
a.M. a.M.
Bd. Bd.
betr. betr.
Betr. Betr.
Biol. Biol.
biol. biol.
Bf. Bf.
Bhf. Bhf.
Bsp. Bsp.
bspw. bspw.
bzgl. bzgl.
bzw. bzw.
d.h. d.h.
dgl. dgl.
ebd. ebd.
ehem. ehem.
eigtl. eigtl.
entspr. entspr.
erm. erm.
ev. ev.
evtl. evtl.
Fa. Fa.
Fam. Fam.
geb. geb.
Gebr. Gebr.
gem. gem.
ggf. ggf.
ggü. ggü.
ggfs. ggfs.
gegr. gegr.
Hbf. Hbf.
Hrsg. Hrsg.
hrsg. hrsg.
i.A. i.A.
i.d.R. i.d.R.
inkl. inkl.
insb. insb.
i.O. i.O.
i.Tr. i.Tr.
i.V. i.V.
jur. jur.
kath. kath.
K.O. K.O.
lt. lt.
max. max.
m.E. m.E.
m.M. m.M.
mtl. mtl.
min. min.
mind. mind.
MwSt. MwSt.
Nr. Nr.
o.a. o.a.
o.ä. o.ä.
o.Ä. o.Ä.
o.g. o.g.
o.k. o.k.
O.K. O.K.
Orig. Orig.
orig. orig.
pers. pers.
Pkt. Pkt.
Red. Red.
röm. röm.
s.o. s.o.
sog. sog.
std. std.
stellv. stellv.
Str. Str.
tägl. tägl.
Tel. Tel.
u.a. u.a.
usf. usf.
u.s.w. u.s.w.
usw. usw.
u.U. u.U.
u.v.m. u.v.m.
uvm. uvm.
v.a. v.a.
vgl. vgl.
vllt. vllt.
v.l.n.r. v.l.n.r.
vlt. vlt.
Vol. Vol.
wiss. wiss.
Univ. Univ.
z.B. z.B.
z.b. z.b.
z.Bsp. z.Bsp.
z.T. z.T.
z.Z. z.Z.
zzgl. zzgl.
z.Zt. z.Zt.
# popular latin abbreviations
vs. vs.
adv. adv.
Chr. Chr.
A.C. A.C.
A.D. A.D.
e.g. e.g.
i.e. i.e.
al. al.
p.a. p.a.
P.S. P.S.
q.e.d. q.e.d.
R.I.P. R.I.P.
etc. etc.
incl. incl.
ca. ca.
n.Chr. n.Chr.
p.s. p.s.
v.Chr. v.Chr.
# popular english abbreviations
D.C. D.C.
N.Y. N.Y.
N.Y.C. N.Y.C.
U.S. U.S.
U.S.A. U.S.A.
L.A. L.A.
U.S.S. U.S.S.
# dates & time
Jan. Jan.
Feb. Feb.
Mrz. Mrz.
Mär. Mär.
Apr. Apr.
Jun. Jun.
Jul. Jul.
Aug. Aug.
Sep. Sep.
Sept. Sept.
Okt. Okt.
Nov. Nov.
Dez. Dez.
Mo. Mo.
Di. Di.
Mi. Mi.
Do. Do.
Fr. Fr.
Sa. Sa.
So. So.
Std. Std.
Jh. Jh.
Jhd. Jhd.
# numbers
Tsd. Tsd.
Mio. Mio.
Mrd. Mrd.
# countries & languages
engl. engl.
frz. frz.
lat. lat.
österr. österr.
# smileys
:) :)
<3 <3
;) ;)
(: (:
:( :(
-_- -_-
=) =)
:/ :/
:> :>
;-) ;-)
:Y :Y
:P :P
:-P :-P
:3 :3
=3 =3
xD xD
^_^ ^_^
=] =]
=D =D
<333 <333
:)) :))
:0 :0
-__- -__-
xDD xDD
o_o o_o
o_O o_O
V_V V_V
=[[ =[[
<33 <33
;p ;p
;D ;D
;-p ;-p
;( ;(
:p :p
:] :]
:O :O
:-/ :-/
:-) :-)
:((( :(((
:(( :((
:') :')
(^_^) (^_^)
(= (=
o.O o.O
# single letters
a. a.
b. b.
c. c.
d. d.
e. e.
f. f.
g. g.
h. h.
i. i.
j. j.
k. k.
l. l.
m. m.
n. n.
o. o.
p. p.
q. q.
r. r.
s. s.
t. t.
u. u.
v. v.
w. w.
x. x.
y. y.
z. z.
ä. ä.
ö. ö.
ü. ü.
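
The header comments describe the columns: surface form, its tokenization, and the lemmas, with | separating the pieces of multi-token entries (a trailing column can add a tag such as SP). The read_hardcoded() helper in the German specials-generation script further down zips those columns into one dict per token; the hand-written equivalent of the auf'm line, for example, looks like this:

# One multi-token line from the table above, e.g.
#     auf'm <TAB> auf|'m <TAB> auf|dem
# becomes a tokenizer special case with one dict per resulting token
# ("F" = form, "L" = lemma); compare the hardcoded entry further down.
SPECIAL_CASE = {"auf'm": [{"F": "auf"}, {"F": "'m", "L": "dem"}]}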

View File

@ -1,194 +0,0 @@
{
"Reddit": [
"PRODUCT",
{},
[
[{"lower": "reddit"}]
]
],
"SeptemberElevenAttacks": [
"EVENT",
{},
[
[
{"orth": "9/11"}
],
[
{"lower": "september"},
{"orth": "11"}
]
]
],
"Linux": [
"PRODUCT",
{},
[
[{"lower": "linux"}]
]
],
"Haskell": [
"PRODUCT",
{},
[
[{"lower": "haskell"}]
]
],
"HaskellCurry": [
"PERSON",
{},
[
[
{"lower": "haskell"},
{"lower": "curry"}
]
]
],
"Javascript": [
"PRODUCT",
{},
[
[{"lower": "javascript"}]
]
],
"CSS": [
"PRODUCT",
{},
[
[{"lower": "css"}],
[{"lower": "css3"}]
]
],
"displaCy": [
"PRODUCT",
{},
[
[{"lower": "displacy"}]
]
],
"spaCy": [
"PRODUCT",
{},
[
[{"orth": "spaCy"}]
]
],
"HTML": [
"PRODUCT",
{},
[
[{"lower": "html"}],
[{"lower": "html5"}]
]
],
"Python": [
"PRODUCT",
{},
[
[{"orth": "Python"}]
]
],
"Ruby": [
"PRODUCT",
{},
[
[{"orth": "Ruby"}]
]
],
"Digg": [
"PRODUCT",
{},
[
[{"lower": "digg"}]
]
],
"FoxNews": [
"ORG",
{},
[
[{"orth": "Fox"}],
[{"orth": "News"}]
]
],
"Google": [
"ORG",
{},
[
[{"lower": "google"}]
]
],
"Mac": [
"PRODUCT",
{},
[
[{"lower": "mac"}]
]
],
"Wikipedia": [
"PRODUCT",
{},
[
[{"lower": "wikipedia"}]
]
],
"Windows": [
"PRODUCT",
{},
[
[{"orth": "Windows"}]
]
],
"Dell": [
"ORG",
{},
[
[{"lower": "dell"}]
]
],
"Facebook": [
"ORG",
{},
[
[{"lower": "facebook"}]
]
],
"Blizzard": [
"ORG",
{},
[
[{"orth": "Blizzard"}]
]
],
"Ubuntu": [
"ORG",
{},
[
[{"orth": "Ubuntu"}]
]
],
"Youtube": [
"PRODUCT",
{},
[
[{"lower": "youtube"}]
]
],
"false_positives": [
null,
{},
[
[{"orth": "Shit"}],
[{"orth": "Weed"}],
[{"orth": "Cool"}],
[{"orth": "Btw"}],
[{"orth": "Bah"}],
[{"orth": "Bullshit"}],
[{"orth": "Lol"}],
[{"orth": "Yo"}, {"lower": "dawg"}],
[{"orth": "Yay"}],
[{"orth": "Ahh"}],
[{"orth": "Yea"}],
[{"orth": "Bah"}]
]
]
}
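
Each entry in this gazetteer (presumably the gazetteer.json that init_model.py copies into the vocab directory) maps an entity name to a three-element list: the entity label, an attribute dict, and a list of alternative token patterns, where every token is constrained by its lowercase form ("lower") or its exact text ("orth"). A quick sketch for inspecting the file as plain JSON, assuming it sits in the working directory:

# List every gazetteer pattern: entity name -> [label, attrs, patterns].
import json

with open("gazetteer.json", encoding="utf8") as file_:
    gazetteer = json.load(file_)

for name, (label, _attrs, patterns) in sorted(gazetteer.items()):
    for pattern in patterns:
        tokens = " ".join(
            spec.get("orth") or spec.get("lower", "?") for spec in pattern
        )
        print("%-25s %-8s %s" % (name, label, tokens))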

View File

@ -1,334 +0,0 @@
# coding=utf8
import json
import io
import itertools
contractions = {}
# contains the lemmas, parts of speech, number, and tenspect of
# potential tokens generated after splitting contractions off
token_properties = {}
# contains starting tokens with their potential contractions
# each potential contraction has a list of exceptions
# lower - don't generate the lowercase version
# upper - don't generate the uppercase version
# contrLower - don't generate the lowercase version with apostrophe (') removed
# contrUpper - don't generate the uppercase version with apostrophe (') removed
# for example, we don't want to create the word "hell" or "Hell" from "he" + "'ll" so
# we add "contrLower" and "contrUpper" to the exceptions list
starting_tokens = {}
# other specials that don't really have contractions
# so they are hardcoded
hardcoded_specials = {
"''": [{"F": "''"}],
"\")": [{"F": "\")"}],
"\n": [{"F": "\n", "pos": "SP"}],
"\t": [{"F": "\t", "pos": "SP"}],
" ": [{"F": " ", "pos": "SP"}],
# example: Wie geht's?
"'s": [{"F": "'s", "L": "es"}],
"'S": [{"F": "'S", "L": "es"}],
# example: Haste mal 'nen Euro?
"'n": [{"F": "'n", "L": "ein"}],
"'ne": [{"F": "'ne", "L": "eine"}],
"'nen": [{"F": "'nen", "L": "einen"}],
# example: Kommen S nur herein!
"s'": [{"F": "s'", "L": "sie"}],
"S'": [{"F": "S'", "L": "sie"}],
# example: Da haben wir's!
"ich's": [{"F": "ich"}, {"F": "'s", "L": "es"}],
"du's": [{"F": "du"}, {"F": "'s", "L": "es"}],
"er's": [{"F": "er"}, {"F": "'s", "L": "es"}],
"sie's": [{"F": "sie"}, {"F": "'s", "L": "es"}],
"wir's": [{"F": "wir"}, {"F": "'s", "L": "es"}],
"ihr's": [{"F": "ihr"}, {"F": "'s", "L": "es"}],
# example: Die katze auf'm dach.
"auf'm": [{"F": "auf"}, {"F": "'m", "L": "dem"}],
"unter'm": [{"F": "unter"}, {"F": "'m", "L": "dem"}],
"über'm": [{"F": "über"}, {"F": "'m", "L": "dem"}],
"vor'm": [{"F": "vor"}, {"F": "'m", "L": "dem"}],
"hinter'm": [{"F": "hinter"}, {"F": "'m", "L": "dem"}],
# persons
"Fr.": [{"F": "Fr."}],
"Hr.": [{"F": "Hr."}],
"Frl.": [{"F": "Frl."}],
"Prof.": [{"F": "Prof."}],
"Dr.": [{"F": "Dr."}],
"St.": [{"F": "St."}],
"Hrgs.": [{"F": "Hrgs."}],
"Hg.": [{"F": "Hg."}],
"a.Z.": [{"F": "a.Z."}],
"a.D.": [{"F": "a.D."}],
"A.D.": [{"F": "A.D."}],
"h.c.": [{"F": "h.c."}],
"jun.": [{"F": "jun."}],
"sen.": [{"F": "sen."}],
"rer.": [{"F": "rer."}],
"Dipl.": [{"F": "Dipl."}],
"Ing.": [{"F": "Ing."}],
"Dipl.-Ing.": [{"F": "Dipl.-Ing."}],
# companies
"Co.": [{"F": "Co."}],
"co.": [{"F": "co."}],
"Cie.": [{"F": "Cie."}],
"A.G.": [{"F": "A.G."}],
"G.m.b.H.": [{"F": "G.m.b.H."}],
"i.G.": [{"F": "i.G."}],
"e.V.": [{"F": "e.V."}],
# popular german abbreviations
"ggü.": [{"F": "ggü."}],
"ggf.": [{"F": "ggf."}],
"ggfs.": [{"F": "ggfs."}],
"Gebr.": [{"F": "Gebr."}],
"geb.": [{"F": "geb."}],
"gegr.": [{"F": "gegr."}],
"erm.": [{"F": "erm."}],
"engl.": [{"F": "engl."}],
"ehem.": [{"F": "ehem."}],
"Biol.": [{"F": "Biol."}],
"biol.": [{"F": "biol."}],
"Abk.": [{"F": "Abk."}],
"Abb.": [{"F": "Abb."}],
"abzgl.": [{"F": "abzgl."}],
"Hbf.": [{"F": "Hbf."}],
"Bhf.": [{"F": "Bhf."}],
"Bf.": [{"F": "Bf."}],
"i.V.": [{"F": "i.V."}],
"inkl.": [{"F": "inkl."}],
"insb.": [{"F": "insb."}],
"z.B.": [{"F": "z.B."}],
"i.Tr.": [{"F": "i.Tr."}],
"Jhd.": [{"F": "Jhd."}],
"jur.": [{"F": "jur."}],
"lt.": [{"F": "lt."}],
"nat.": [{"F": "nat."}],
"u.a.": [{"F": "u.a."}],
"u.s.w.": [{"F": "u.s.w."}],
"Nr.": [{"F": "Nr."}],
"Univ.": [{"F": "Univ."}],
"vgl.": [{"F": "vgl."}],
"zzgl.": [{"F": "zzgl."}],
"z.Z.": [{"F": "z.Z."}],
"betr.": [{"F": "betr."}],
"ehem.": [{"F": "ehem."}],
# popular latin abbreviations
"vs.": [{"F": "vs."}],
"adv.": [{"F": "adv."}],
"Chr.": [{"F": "Chr."}],
"A.C.": [{"F": "A.C."}],
"A.D.": [{"F": "A.D."}],
"e.g.": [{"F": "e.g."}],
"i.e.": [{"F": "i.e."}],
"al.": [{"F": "al."}],
"p.a.": [{"F": "p.a."}],
"P.S.": [{"F": "P.S."}],
"q.e.d.": [{"F": "q.e.d."}],
"R.I.P.": [{"F": "R.I.P."}],
"etc.": [{"F": "etc."}],
"incl.": [{"F": "incl."}],
# popular english abbreviations
"D.C.": [{"F": "D.C."}],
"N.Y.": [{"F": "N.Y."}],
"N.Y.C.": [{"F": "N.Y.C."}],
# dates
"Jan.": [{"F": "Jan."}],
"Feb.": [{"F": "Feb."}],
"Mrz.": [{"F": "Mrz."}],
"Mär.": [{"F": "Mär."}],
"Apr.": [{"F": "Apr."}],
"Jun.": [{"F": "Jun."}],
"Jul.": [{"F": "Jul."}],
"Aug.": [{"F": "Aug."}],
"Sep.": [{"F": "Sep."}],
"Sept.": [{"F": "Sept."}],
"Okt.": [{"F": "Okt."}],
"Nov.": [{"F": "Nov."}],
"Dez.": [{"F": "Dez."}],
"Mo.": [{"F": "Mo."}],
"Di.": [{"F": "Di."}],
"Mi.": [{"F": "Mi."}],
"Do.": [{"F": "Do."}],
"Fr.": [{"F": "Fr."}],
"Sa.": [{"F": "Sa."}],
"So.": [{"F": "So."}],
# smileys
":)": [{"F": ":)"}],
"<3": [{"F": "<3"}],
";)": [{"F": ";)"}],
"(:": [{"F": "(:"}],
":(": [{"F": ":("}],
"-_-": [{"F": "-_-"}],
"=)": [{"F": "=)"}],
":/": [{"F": ":/"}],
":>": [{"F": ":>"}],
";-)": [{"F": ";-)"}],
":Y": [{"F": ":Y"}],
":P": [{"F": ":P"}],
":-P": [{"F": ":-P"}],
":3": [{"F": ":3"}],
"=3": [{"F": "=3"}],
"xD": [{"F": "xD"}],
"^_^": [{"F": "^_^"}],
"=]": [{"F": "=]"}],
"=D": [{"F": "=D"}],
"<333": [{"F": "<333"}],
":))": [{"F": ":))"}],
":0": [{"F": ":0"}],
"-__-": [{"F": "-__-"}],
"xDD": [{"F": "xDD"}],
"o_o": [{"F": "o_o"}],
"o_O": [{"F": "o_O"}],
"V_V": [{"F": "V_V"}],
"=[[": [{"F": "=[["}],
"<33": [{"F": "<33"}],
";p": [{"F": ";p"}],
";D": [{"F": ";D"}],
";-p": [{"F": ";-p"}],
";(": [{"F": ";("}],
":p": [{"F": ":p"}],
":]": [{"F": ":]"}],
":O": [{"F": ":O"}],
":-/": [{"F": ":-/"}],
":-)": [{"F": ":-)"}],
":(((": [{"F": ":((("}],
":((": [{"F": ":(("}],
":')": [{"F": ":')"}],
"(^_^)": [{"F": "(^_^)"}],
"(=": [{"F": "(="}],
"o.O": [{"F": "o.O"}],
"a.": [{"F": "a."}],
"b.": [{"F": "b."}],
"c.": [{"F": "c."}],
"d.": [{"F": "d."}],
"e.": [{"F": "e."}],
"f.": [{"F": "f."}],
"g.": [{"F": "g."}],
"h.": [{"F": "h."}],
"i.": [{"F": "i."}],
"j.": [{"F": "j."}],
"k.": [{"F": "k."}],
"l.": [{"F": "l."}],
"m.": [{"F": "m."}],
"n.": [{"F": "n."}],
"o.": [{"F": "o."}],
"p.": [{"F": "p."}],
"q.": [{"F": "q."}],
"r.": [{"F": "r."}],
"s.": [{"F": "s."}],
"t.": [{"F": "t."}],
"u.": [{"F": "u."}],
"v.": [{"F": "v."}],
"w.": [{"F": "w."}],
"x.": [{"F": "x."}],
"y.": [{"F": "y."}],
"z.": [{"F": "z."}],
}
def get_double_contractions(ending):
endings = []
ends_with_contraction = any([ending.endswith(contraction) for contraction in contractions])
while ends_with_contraction:
for contraction in contractions:
if ending.endswith(contraction):
endings.append(contraction)
ending = ending.rstrip(contraction)
ends_with_contraction = any([ending.endswith(contraction) for contraction in contractions])
endings.reverse() # reverse because the last ending is put in the list first
return endings
def get_token_properties(token, capitalize=False, remove_contractions=False):
props = dict(token_properties.get(token)) # ensure we copy the dict so we can add the "F" prop
if capitalize:
token = token.capitalize()
if remove_contractions:
token = token.replace("'", "")
props["F"] = token
return props
def create_entry(token, endings, capitalize=False, remove_contractions=False):
properties = []
properties.append(get_token_properties(token, capitalize=capitalize, remove_contractions=remove_contractions))
for e in endings:
properties.append(get_token_properties(e, remove_contractions=remove_contractions))
return properties
FIELDNAMES = ['F','L','pos']
def read_hardcoded(stream):
hc_specials = {}
for line in stream:
line = line.strip()
if line.startswith('#') or not line:
continue
key,_,rest = line.partition('\t')
values = []
for annotation in zip(*[ e.split('|') for e in rest.split('\t') ]):
values.append({ k:v for k,v in itertools.izip_longest(FIELDNAMES,annotation) if v })
hc_specials[key] = values
return hc_specials
def generate_specials():
specials = {}
for token in starting_tokens:
possible_endings = starting_tokens[token]
for ending in possible_endings:
endings = []
if ending.count("'") > 1:
endings.extend(get_double_contractions(ending))
else:
endings.append(ending)
exceptions = possible_endings[ending]
if "lower" not in exceptions:
special = token + ending
specials[special] = create_entry(token, endings)
if "upper" not in exceptions:
special = token.capitalize() + ending
specials[special] = create_entry(token, endings, capitalize=True)
if "contrLower" not in exceptions:
special = token + ending.replace("'", "")
specials[special] = create_entry(token, endings, remove_contractions=True)
if "contrUpper" not in exceptions:
special = token.capitalize() + ending.replace("'", "")
specials[special] = create_entry(token, endings, capitalize=True, remove_contractions=True)
# add in hardcoded specials
# changed it so it generates them from a file
with io.open('abbrev.de.tab','r',encoding='utf8') as abbrev_:
hc_specials = read_hardcoded(abbrev_)
specials = dict(specials, **hc_specials)
return specials
if __name__ == "__main__":
specials = generate_specials()
with open("specials.json", "w") as f:
json.dump(specials, f, sort_keys=True, indent=4, separators=(',', ': '))
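
The least obvious part of this script is read_hardcoded(): it splits the remaining columns on |, zips them token-wise, and pairs each token's values with FIELDNAMES, dropping empty cells (itertools.izip_longest is the Python 2 spelling; on Python 3 the same function is itertools.zip_longest). Walking one abbrev.de.tab line through the same steps:

# Sketch: what read_hardcoded() does with one line of abbrev.de.tab.
# Columns: surface \t tokenization \t lemmas; multi-token pieces joined by |.
line = "auf'm\tauf|'m\tauf|dem"

key, _, rest = line.partition("\t")
columns = [col.split("|") for col in rest.split("\t")]  # [['auf', "'m"], ['auf', 'dem']]
entry = []
for annotation in zip(*columns):                        # token-wise: ('auf', 'auf'), ("'m", 'dem')
    entry.append({k: v for k, v in zip(["F", "L", "pos"], annotation) if v})

print({key: entry})
# {"auf'm": [{'F': 'auf', 'L': 'auf'}, {'F': "'m", 'L': 'dem'}]}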

View File

@ -1,6 +0,0 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ])
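
These infix rules are regular expressions the tokenizer looks for inside a token; each match marks a split point, and the lookbehind/lookahead pairs keep the surrounding letters out of the match itself. A small sketch of applying such a list with plain re, independent of spaCy's actual tokenizer internals:

# Sketch: find infix split points using a few of the German rules above.
import re

INFIXES = [
    r"\.\.\.",
    r"(?<=[a-z])\.(?=[A-Z])",
    r'(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ])',
]
infix_re = re.compile("|".join(INFIXES))

for text in ["ein...zwei", "klein.Gross", "links:rechts"]:
    spans = [(m.start(), m.end()) for m in infix_re.finditer(text)]
    print(text, spans)   # e.g. ein...zwei [(3, 6)]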

View File

@ -1 +0,0 @@
{}

View File

@ -1,71 +0,0 @@
{
"PRP": {
"ich": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 1},
"meiner": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 2},
"mir": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 3},
"mich": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 4},
"du": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 1},
"deiner": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 2},
"dir": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 3},
"dich": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 4},
"er": {"L": "-PRON-", "person": 3, "number": 0, "gender": 1, "case": 1},
"seiner": {"L": "-PRON-", "person": 3, "number": 0, "gender": 1, "case": 2},
"ihm": {"L": "-PRON-", "person": 3, "number": 0, "gender": 1, "case": 3},
"ihn": {"L": "-PRON-", "person": 3, "number": 0, "gender": 1, "case": 4},
"sie": {"L": "-PRON-", "person": 3, "number": 0, "gender": 2, "case": 1},
"ihrer": {"L": "-PRON-", "person": 3, "number": 0, "gender": 2, "case": 2},
"ihr": {"L": "-PRON-", "person": 3, "number": 0, "gender": 2, "case": 3},
"sie": {"L": "-PRON-", "person": 3, "number": 0, "gender": 2, "case": 4},
"es": {"L": "-PRON-", "person": 3, "number": 0, "gender": 3, "case": 1},
"seiner": {"L": "-PRON-", "person": 3, "number": 0, "gender": 3, "case": 2},
"ihm": {"L": "-PRON-", "person": 3, "number": 0, "gender": 3, "case": 3},
"es": {"L": "-PRON-", "person": 3, "number": 0, "gender": 3, "case": 4},
"wir": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 1},
"unser": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 2},
"uns": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 3},
"uns": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 4},
"ihr": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 1},
"euer": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 2},
"euch": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 3},
"euch": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 4},
"sie": {"L": "-PRON-", "person": 3, "number": 0, "gender": 0, "case": 1},
"ihrer": {"L": "-PRON-", "person": 3, "number": 0, "gender": 0, "case": 2},
"ihnen": {"L": "-PRON-", "person": 3, "number": 0, "gender": 0, "case": 3},
"sie": {"L": "-PRON-", "person": 3, "number": 0, "gender": 0, "case": 4}
},
"PRP$": {
"mein": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 1},
"meines": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 2},
"meinem": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 3},
"meinen": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 4},
"dein": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 1},
"deines": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 2},
"deinem": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 3},
"deinen": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 4},
"sein": {"L": "-PRON-", "person": 3, "number": 0, "gender": 1, "case": 1},
"seines": {"L": "-PRON-", "person": 3, "number": 0, "gender": 1, "case": 2},
"seinem": {"L": "-PRON-", "person": 3, "number": 0, "gender": 1, "case": 3},
"seinen": {"L": "-PRON-", "person": 3, "number": 0, "gender": 1, "case": 4},
"ihr": {"L": "-PRON-", "person": 3, "number": 0, "gender": 2, "case": 1},
"ihrer": {"L": "-PRON-", "person": 3, "number": 0, "gender": 2, "case": 2},
"ihrem": {"L": "-PRON-", "person": 3, "number": 0, "gender": 2, "case": 3},
"ihren": {"L": "-PRON-", "person": 3, "number": 0, "gender": 2, "case": 4},
"sein": {"L": "-PRON-", "person": 3, "number": 0, "gender": 3, "case": 1},
"seines": {"L": "-PRON-", "person": 3, "number": 0, "gender": 3, "case": 2},
"seinem": {"L": "-PRON-", "person": 3, "number": 0, "gender": 3, "case": 3},
"seinen": {"L": "-PRON-", "person": 3, "number": 0, "gender": 3, "case": 4},
"unser": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 1},
"unseres": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 2},
"unserem": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 3},
"unseren": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 4},
"euer": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 1},
"eures": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 2},
"eurem": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 3},
"euren": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 4},
"ihr": {"L": "-PRON-", "person": 3, "number": 0, "gender": 0, "case": 1},
"ihres": {"L": "-PRON-", "person": 3, "number": 0, "gender": 0, "case": 2},
"ihrem": {"L": "-PRON-", "person": 3, "number": 0, "gender": 0, "case": 3},
"ihren": {"L": "-PRON-", "person": 3, "number": 0, "gender": 0, "case": 4}
}
}

View File

@ -1,27 +0,0 @@
,
"
(
[
{
*
<
>
$
£
'
``
`
#
US$
C$
A$
a-
....
...
»
_
§

View File

@ -1,3 +0,0 @@
Biografie: Ein Spiel ist ein Theaterstück des Schweizer Schriftstellers Max Frisch, das 1967 entstand und am 1. Februar 1968 im Schauspielhaus Zürich uraufgeführt wurde. 1984 legte Frisch eine überarbeitete Neufassung vor. Das von Frisch als Komödie bezeichnete Stück greift eines seiner zentralen Themen auf: die Möglichkeit oder Unmöglichkeit des Menschen, seine Identität zu verändern.
Mit Biografie: Ein Spiel wandte sich Frisch von der Parabelform seiner Erfolgsstücke Biedermann und die Brandstifter und Andorra ab und postulierte eine „Dramaturgie der Permutation“. Darin sollte nicht, wie im klassischen Theater, Sinn und Schicksal im Mittelpunkt stehen, sondern die Zufälligkeit von Ereignissen und die Möglichkeit ihrer Variation. Dennoch handelt Biografie: Ein Spiel gerade von der Unmöglichkeit seines Protagonisten, seinen Lebenslauf grundlegend zu verändern. Frisch empfand die Wirkung des Stücks im Nachhinein als zu fatalistisch und die Umsetzung seiner theoretischen Absichten als nicht geglückt. Obwohl das Stück 1968 als unpolitisch und nicht zeitgemäß kritisiert wurde und auch später eine geteilte Rezeption erfuhr, gehört es an deutschsprachigen Bühnen zu den häufiger aufgeführten Stücken Frischs.

File diff suppressed because it is too large.

View File

@ -1,73 +0,0 @@
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
«
_
''
's
'S
s
S
°
\.\.
\.\.\.
\.\.\.\.
(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
\-\-
´
(?<=[0-9])km²
(?<=[0-9])m²
(?<=[0-9])cm²
(?<=[0-9])mm²
(?<=[0-9])km³
(?<=[0-9])m³
(?<=[0-9])cm³
(?<=[0-9])mm³
(?<=[0-9])ha
(?<=[0-9])km
(?<=[0-9])m
(?<=[0-9])cm
(?<=[0-9])mm
(?<=[0-9])µm
(?<=[0-9])nm
(?<=[0-9])yd
(?<=[0-9])in
(?<=[0-9])ft
(?<=[0-9])kg
(?<=[0-9])g
(?<=[0-9])mg
(?<=[0-9])µg
(?<=[0-9])t
(?<=[0-9])lb
(?<=[0-9])oz
(?<=[0-9])m/s
(?<=[0-9])km/h
(?<=[0-9])mph
(?<=[0-9])°C
(?<=[0-9])°K
(?<=[0-9])°F
(?<=[0-9])hPa
(?<=[0-9])Pa
(?<=[0-9])mbar
(?<=[0-9])mb
(?<=[0-9])T
(?<=[0-9])G
(?<=[0-9])M
(?<=[0-9])K
(?<=[0-9])kb
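
The suffix rules work the same way but are matched at the end of the token; the digit lookbehinds split a unit such as km off a measurement without touching the number itself. Another minimal sketch outside spaCy proper:

# Sketch: peel one suffix off the end of a token using rules from the list above.
import re

SUFFIXES = [r"(?<=[0-9])km", r"(?<=[0-9])kg", r"(?<=[0-9])°C", r"\.\.\."]
suffix_re = re.compile("(?:%s)$" % "|".join(SUFFIXES))

def split_suffix(token):
    match = suffix_re.search(token)
    if match is None:
        return [token]
    return [token[:match.start()], token[match.start():]]

print(split_suffix("25km"))  # ['25', 'km']
print(split_suffix("3°C"))   # ['3', '°C']
print(split_suffix("Haus"))  # ['Haus']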

View File

@ -1,59 +0,0 @@
{
"$(": {"pos": "PUNCT", "PunctType": "Brck"},
"$,": {"pos": "PUNCT", "PunctType": "Comm"},
"$.": {"pos": "PUNCT", "PunctType": "Peri"},
"ADJA": {"pos": "ADJ"},
"ADJD": {"pos": "ADJ", "Variant": "Short"},
"ADV": {"pos": "ADV"},
"APPO": {"pos": "ADP", "AdpType": "Post"},
"APPR": {"pos": "ADP", "AdpType": "Prep"},
"APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"},
"APZR": {"pos": "ADP", "AdpType": "Circ"},
"ART": {"pos": "DET", "PronType": "Art"},
"CARD": {"pos": "NUM", "NumType": "Card"},
"FM": {"pos": "X", "Foreign": "Yes"},
"ITJ": {"pos": "INTJ"},
"KOKOM": {"pos": "CONJ", "ConjType": "Comp"},
"KON": {"pos": "CONJ"},
"KOUI": {"pos": "SCONJ"},
"KOUS": {"pos": "SCONJ"},
"NE": {"pos": "PROPN"},
"NNE": {"pos": "PROPN"},
"NN": {"pos": "NOUN"},
"PAV": {"pos": "ADV", "PronType": "Dem"},
"PROAV": {"pos": "ADV", "PronType": "Dem"},
"PDAT": {"pos": "DET", "PronType": "Dem"},
"PDS": {"pos": "PRON", "PronType": "Dem"},
"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"},
"PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"},
"PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"},
"PPER": {"pos": "PRON", "PronType": "Prs"},
"PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"},
"PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"},
"PRELAT": {"pos": "DET", "PronType": "Rel"},
"PRELS": {"pos": "PRON", "PronType": "Rel"},
"PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"},
"PTKA": {"pos": "PART"},
"PTKANT": {"pos": "PART", "PartType": "Res"},
"PTKNEG": {"pos": "PART", "Negative": "Neg"},
"PTKVZ": {"pos": "PART", "PartType": "Vbp"},
"PTKZU": {"pos": "PART", "PartType": "Inf"},
"PWAT": {"pos": "DET", "PronType": "Int"},
"PWAV": {"pos": "ADV", "PronType": "Int"},
"PWS": {"pos": "PRON", "PronType": "Int"},
"TRUNC": {"pos": "X", "Hyph": "Yes"},
"VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"},
"VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"},
"VAINF": {"pos": "AUX", "VerbForm": "Inf"},
"VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"},
"VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"},
"VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"},
"VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"},
"VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"},
"VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"},
"VVINF": {"pos": "VERB", "VerbForm": "Inf"},
"VVIZU": {"pos": "VERB", "VerbForm": "Inf"},
"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},
"XY": {"pos": "X"},
"SP": {"pos": "SPACE"}
}
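
This tag map (presumably the tag_map.json that init_model.py loads with json.load and passes to the Vocab) translates the fine-grained STTS/TIGER tags into a coarse universal part-of-speech plus morphological features. A lookup is a plain dict access, assuming the file is in the working directory:

# Sketch: coarse POS and features for a fine-grained German tag.
import json

with open("tag_map.json", encoding="utf8") as file_:
    tag_map = json.load(file_)

features = dict(tag_map["APPRART"])  # {'pos': 'ADP', 'AdpType': 'Prep', 'PronType': 'Art'}
coarse_pos = features.pop("pos")
print(coarse_pos, features)          # ADP {'AdpType': 'Prep', 'PronType': 'Art'}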

View File

@ -1,20 +0,0 @@
WordNet Release 3.0 This software and database is being provided to you, the
LICENSEE, by Princeton University under the following license. By obtaining,
using and/or copying this software and database, you agree that you have read,
understood, and will comply with these terms and conditions.: Permission to
use, copy, modify and distribute this software and database and its
documentation for any purpose and without fee or royalty is hereby granted,
provided that you agree to comply with the following copyright notice and
statements, including the disclaimer, and that the same appear on ALL copies of
the software, database and documentation, including modifications that you make for internal use or for distribution. WordNet 3.0 Copyright 2006 by Princeton
University. All rights reserved. THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS"
AND PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON UNIVERSITY MAKES NO
REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE, DATABASE OR
DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS
OR OTHER RIGHTS. The name of Princeton University or Princeton may not be used
in advertising or publicity pertaining to distribution of the software and/or
database. Title to copyright in this software, database and any associated
documentation shall at all times remain with Princeton University and LICENSEE
agrees to preserve same.

View File

@ -1,194 +0,0 @@
{
"Reddit": [
"PRODUCT",
{},
[
[{"lower": "reddit"}]
]
],
"SeptemberElevenAttacks": [
"EVENT",
{},
[
[
{"orth": "9/11"}
],
[
{"lower": "september"},
{"orth": "11"}
]
]
],
"Linux": [
"PRODUCT",
{},
[
[{"lower": "linux"}]
]
],
"Haskell": [
"PRODUCT",
{},
[
[{"lower": "haskell"}]
]
],
"HaskellCurry": [
"PERSON",
{},
[
[
{"lower": "haskell"},
{"lower": "curry"}
]
]
],
"Javascript": [
"PRODUCT",
{},
[
[{"lower": "javascript"}]
]
],
"CSS": [
"PRODUCT",
{},
[
[{"lower": "css"}],
[{"lower": "css3"}]
]
],
"displaCy": [
"PRODUCT",
{},
[
[{"lower": "displacy"}]
]
],
"spaCy": [
"PRODUCT",
{},
[
[{"orth": "spaCy"}]
]
],
"HTML": [
"PRODUCT",
{},
[
[{"lower": "html"}],
[{"lower": "html5"}]
]
],
"Python": [
"PRODUCT",
{},
[
[{"orth": "Python"}]
]
],
"Ruby": [
"PRODUCT",
{},
[
[{"orth": "Ruby"}]
]
],
"Digg": [
"PRODUCT",
{},
[
[{"lower": "digg"}]
]
],
"FoxNews": [
"ORG",
{},
[
[{"orth": "Fox"}],
[{"orth": "News"}]
]
],
"Google": [
"ORG",
{},
[
[{"lower": "google"}]
]
],
"Mac": [
"PRODUCT",
{},
[
[{"lower": "mac"}]
]
],
"Wikipedia": [
"PRODUCT",
{},
[
[{"lower": "wikipedia"}]
]
],
"Windows": [
"PRODUCT",
{},
[
[{"orth": "Windows"}]
]
],
"Dell": [
"ORG",
{},
[
[{"lower": "dell"}]
]
],
"Facebook": [
"ORG",
{},
[
[{"lower": "facebook"}]
]
],
"Blizzard": [
"ORG",
{},
[
[{"orth": "Blizzard"}]
]
],
"Ubuntu": [
"ORG",
{},
[
[{"orth": "Ubuntu"}]
]
],
"Youtube": [
"PRODUCT",
{},
[
[{"lower": "youtube"}]
]
],
"false_positives": [
null,
{},
[
[{"orth": "Shit"}],
[{"orth": "Weed"}],
[{"orth": "Cool"}],
[{"orth": "Btw"}],
[{"orth": "Bah"}],
[{"orth": "Bullshit"}],
[{"orth": "Lol"}],
[{"orth": "Yo"}, {"lower": "dawg"}],
[{"orth": "Yay"}],
[{"orth": "Ahh"}],
[{"orth": "Yea"}],
[{"orth": "Bah"}]
]
]
}

View File

@ -1,422 +0,0 @@
# -*- coding: utf-8 -*-
import json
contractions = {"n't", "'nt", "not", "'ve", "'d", "'ll", "'s", "'m", "'ma", "'re"}
# contains the lemmas, parts of speech, number, and tenspect of
# potential tokens generated after splitting contractions off
token_properties = {
"ai": {"L": "be", "pos": "VBP", "number": 2},
"are": {"L": "be", "pos": "VBP", "number": 2},
"ca": {"L": "can", "pos": "MD"},
"can": {"L": "can", "pos": "MD"},
"could": {"pos": "MD", "L": "could"},
"'d": {"L": "would", "pos": "MD"},
"did": {"L": "do", "pos": "VBD"},
"do": {"L": "do"},
"does": {"L": "do", "pos": "VBZ"},
"had": {"L": "have", "pos": "VBD"},
"has": {"L": "have", "pos": "VBZ"},
"have": {"pos": "VB"},
"he": {"L": "-PRON-", "pos": "PRP"},
"how": {},
"i": {"L": "-PRON-", "pos": "PRP"},
"is": {"L": "be", "pos": "VBZ"},
"it": {"L": "-PRON-", "pos": "PRP"},
"'ll": {"L": "will", "pos": "MD"},
"'m": {"L": "be", "pos": "VBP", "number": 1, "tenspect": 1},
"'ma": {},
"might": {},
"must": {},
"need": {},
"not": {"L": "not", "pos": "RB"},
"'nt": {"L": "not", "pos": "RB"},
"n't": {"L": "not", "pos": "RB"},
"'re": {"L": "be", "pos": "VBZ"},
"'s": {}, # no POS or lemma for s?
"sha": {"L": "shall", "pos": "MD"},
"she": {"L": "-PRON-", "pos": "PRP"},
"should": {},
"that": {},
"there": {},
"they": {"L": "-PRON-", "pos": "PRP"},
"was": {},
"we": {"L": "-PRON-", "pos": "PRP"},
"were": {},
"what": {},
"when": {},
"where": {},
"who": {},
"why": {},
"wo": {},
"would": {},
"you": {"L": "-PRON-", "pos": "PRP"},
"'ve": {"L": "have", "pos": "VB"}
}
# contains starting tokens with their potential contractions
# each potential contraction has a list of exceptions
# lower - don't generate the lowercase version
# upper - don't generate the uppercase version
# contrLower - don't generate the lowercase version with apostrophe (') removed
# contrUpper - don't generate the uppercase version with apostrophe (') removed
# for example, we don't want to create the word "hell" or "Hell" from "he" + "'ll" so
# we add "contrLower" and "contrUpper" to the exceptions list
starting_tokens = {
"ai": {"n't": []},
"are": {"n't": []},
"ca": {"n't": []},
"can": {"not": []},
"could": {"'ve": [], "n't": [], "n't've": []},
"did": {"n't": []},
"does": {"n't": []},
"do": {"n't": []},
"had": {"n't": [], "n't've": []},
"has": {"n't": []},
"have": {"n't": []},
"he": {"'d": [], "'d've": [], "'ll": ["contrLower", "contrUpper"], "'s": []},
"how": {"'d": [], "'ll": [], "'s": []},
"i": {"'d": ["contrLower", "contrUpper"], "'d've": [], "'ll": ["contrLower", "contrUpper"], "'m": [], "'ma": [], "'ve": []},
"is": {"n't": []},
"it": {"'d": [], "'d've": [], "'ll": [], "'s": ["contrLower", "contrUpper"]},
"might": {"n't": [], "n't've": [], "'ve": []},
"must": {"n't": [], "'ve": []},
"need": {"n't": []},
"not": {"'ve": []},
"sha": {"n't": []},
"she": {"'d": ["contrLower", "contrUpper"], "'d've": [], "'ll": ["contrLower", "contrUpper"], "'s": []},
"should": {"'ve": [], "n't": [], "n't've": []},
"that": {"'s": []},
"there": {"'d": [], "'d've": [], "'s": ["contrLower", "contrUpper"], "'ll": []},
"they": {"'d": [], "'d've": [], "'ll": [], "'re": [], "'ve": []},
"was": {"n't": []},
"we": {"'d": ["contrLower", "contrUpper"], "'d've": [], "'ll": ["contrLower", "contrUpper"], "'re": ["contrLower", "contrUpper"], "'ve": []},
"were": {"n't": []},
"what": {"'ll": [], "'re": [], "'s": [], "'ve": []},
"when": {"'s": []},
"where": {"'d": [], "'s": [], "'ve": []},
"who": {"'d": [], "'ll": [], "'re": ["contrLower", "contrUpper"], "'s": [], "'ve": []},
"why": {"'ll": [], "'re": [], "'s": []},
"wo": {"n't": []},
"would": {"'ve": [], "n't": [], "n't've": []},
"you": {"'d": [], "'d've": [], "'ll": [], "'re": [], "'ve": []}
}
# other specials that don't really have contractions
# so they are hardcoded
hardcoded_specials = {
"let's": [{"F": "let"}, {"F": "'s", "L": "us"}],
"Let's": [{"F": "Let"}, {"F": "'s", "L": "us"}],
"'s": [{"F": "'s", "L": "'s"}],
"'S": [{"F": "'S", "L": "'s"}],
u"\u2018s": [{"F": u"\u2018s", "L": "'s"}],
u"\u2018S": [{"F": u"\u2018S", "L": "'s"}],
"'em": [{"F": "'em"}],
"'ol": [{"F": "'ol"}],
"vs.": [{"F": "vs."}],
"Ms.": [{"F": "Ms."}],
"Mr.": [{"F": "Mr."}],
"Dr.": [{"F": "Dr."}],
"Mrs.": [{"F": "Mrs."}],
"Messrs.": [{"F": "Messrs."}],
"Gov.": [{"F": "Gov."}],
"Gen.": [{"F": "Gen."}],
"Mt.": [{"F": "Mt.", "L": "Mount"}],
"''": [{"F": "''"}],
"": [{"F": "", "L": "--", "pos": ":"}],
"Corp.": [{"F": "Corp."}],
"Inc.": [{"F": "Inc."}],
"Co.": [{"F": "Co."}],
"co.": [{"F": "co."}],
"Ltd.": [{"F": "Ltd."}],
"Bros.": [{"F": "Bros."}],
"Rep.": [{"F": "Rep."}],
"Sen.": [{"F": "Sen."}],
"Jr.": [{"F": "Jr."}],
"Rev.": [{"F": "Rev."}],
"Adm.": [{"F": "Adm."}],
"St.": [{"F": "St."}],
"a.m.": [{"F": "a.m."}],
"p.m.": [{"F": "p.m."}],
"1a.m.": [{"F": "1"}, {"F": "a.m."}],
"2a.m.": [{"F": "2"}, {"F": "a.m."}],
"3a.m.": [{"F": "3"}, {"F": "a.m."}],
"4a.m.": [{"F": "4"}, {"F": "a.m."}],
"5a.m.": [{"F": "5"}, {"F": "a.m."}],
"6a.m.": [{"F": "6"}, {"F": "a.m."}],
"7a.m.": [{"F": "7"}, {"F": "a.m."}],
"8a.m.": [{"F": "8"}, {"F": "a.m."}],
"9a.m.": [{"F": "9"}, {"F": "a.m."}],
"10a.m.": [{"F": "10"}, {"F": "a.m."}],
"11a.m.": [{"F": "11"}, {"F": "a.m."}],
"12a.m.": [{"F": "12"}, {"F": "a.m."}],
"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],
"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],
"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],
"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],
"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],
"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],
"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],
"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],
"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],
"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],
"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],
"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],
"p.m.": [{"F": "p.m."}],
"1p.m.": [{"F": "1"}, {"F": "p.m."}],
"2p.m.": [{"F": "2"}, {"F": "p.m."}],
"3p.m.": [{"F": "3"}, {"F": "p.m."}],
"4p.m.": [{"F": "4"}, {"F": "p.m."}],
"5p.m.": [{"F": "5"}, {"F": "p.m."}],
"6p.m.": [{"F": "6"}, {"F": "p.m."}],
"7p.m.": [{"F": "7"}, {"F": "p.m."}],
"8p.m.": [{"F": "8"}, {"F": "p.m."}],
"9p.m.": [{"F": "9"}, {"F": "p.m."}],
"10p.m.": [{"F": "10"}, {"F": "p.m."}],
"11p.m.": [{"F": "11"}, {"F": "p.m."}],
"12p.m.": [{"F": "12"}, {"F": "p.m."}],
"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],
"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],
"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],
"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],
"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],
"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],
"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],
"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],
"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],
"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],
"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],
"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],
"Jan.": [{"F": "Jan."}],
"Feb.": [{"F": "Feb."}],
"Mar.": [{"F": "Mar."}],
"Apr.": [{"F": "Apr."}],
"May.": [{"F": "May."}],
"Jun.": [{"F": "Jun."}],
"Jul.": [{"F": "Jul."}],
"Aug.": [{"F": "Aug."}],
"Sep.": [{"F": "Sep."}],
"Sept.": [{"F": "Sept."}],
"Oct.": [{"F": "Oct."}],
"Nov.": [{"F": "Nov."}],
"Dec.": [{"F": "Dec."}],
"Ala.": [{"F": "Ala."}],
"Ariz.": [{"F": "Ariz."}],
"Ark.": [{"F": "Ark."}],
"Calif.": [{"F": "Calif."}],
"Colo.": [{"F": "Colo."}],
"Conn.": [{"F": "Conn."}],
"Del.": [{"F": "Del."}],
"D.C.": [{"F": "D.C."}],
"Fla.": [{"F": "Fla."}],
"Ga.": [{"F": "Ga."}],
"Ill.": [{"F": "Ill."}],
"Ind.": [{"F": "Ind."}],
"Kans.": [{"F": "Kans."}],
"Kan.": [{"F": "Kan."}],
"Ky.": [{"F": "Ky."}],
"La.": [{"F": "La."}],
"Md.": [{"F": "Md."}],
"Mass.": [{"F": "Mass."}],
"Mich.": [{"F": "Mich."}],
"Minn.": [{"F": "Minn."}],
"Miss.": [{"F": "Miss."}],
"Mo.": [{"F": "Mo."}],
"Mont.": [{"F": "Mont."}],
"Nebr.": [{"F": "Nebr."}],
"Neb.": [{"F": "Neb."}],
"Nev.": [{"F": "Nev."}],
"N.H.": [{"F": "N.H."}],
"N.J.": [{"F": "N.J."}],
"N.M.": [{"F": "N.M."}],
"N.Y.": [{"F": "N.Y."}],
"N.C.": [{"F": "N.C."}],
"N.D.": [{"F": "N.D."}],
"Okla.": [{"F": "Okla."}],
"Ore.": [{"F": "Ore."}],
"Pa.": [{"F": "Pa."}],
"Tenn.": [{"F": "Tenn."}],
"Va.": [{"F": "Va."}],
"Wash.": [{"F": "Wash."}],
"Wis.": [{"F": "Wis."}],
":)": [{"F": ":)"}],
"<3": [{"F": "<3"}],
";)": [{"F": ";)"}],
"(:": [{"F": "(:"}],
":(": [{"F": ":("}],
"-_-": [{"F": "-_-"}],
"=)": [{"F": "=)"}],
":/": [{"F": ":/"}],
":>": [{"F": ":>"}],
";-)": [{"F": ";-)"}],
":Y": [{"F": ":Y"}],
":P": [{"F": ":P"}],
":-P": [{"F": ":-P"}],
":3": [{"F": ":3"}],
"=3": [{"F": "=3"}],
"xD": [{"F": "xD"}],
"^_^": [{"F": "^_^"}],
"=]": [{"F": "=]"}],
"=D": [{"F": "=D"}],
"<333": [{"F": "<333"}],
":))": [{"F": ":))"}],
":0": [{"F": ":0"}],
"-__-": [{"F": "-__-"}],
"xDD": [{"F": "xDD"}],
"o_o": [{"F": "o_o"}],
"o_O": [{"F": "o_O"}],
"V_V": [{"F": "V_V"}],
"=[[": [{"F": "=[["}],
"<33": [{"F": "<33"}],
";p": [{"F": ";p"}],
";D": [{"F": ";D"}],
";-p": [{"F": ";-p"}],
";(": [{"F": ";("}],
":p": [{"F": ":p"}],
":]": [{"F": ":]"}],
":O": [{"F": ":O"}],
":-/": [{"F": ":-/"}],
":-)": [{"F": ":-)"}],
":(((": [{"F": ":((("}],
":((": [{"F": ":(("}],
":')": [{"F": ":')"}],
"(^_^)": [{"F": "(^_^)"}],
"(=": [{"F": "(="}],
"o.O": [{"F": "o.O"}],
"\")": [{"F": "\")"}],
"a.": [{"F": "a."}],
"b.": [{"F": "b."}],
"c.": [{"F": "c."}],
"d.": [{"F": "d."}],
"e.": [{"F": "e."}],
"f.": [{"F": "f."}],
"g.": [{"F": "g."}],
"h.": [{"F": "h."}],
"i.": [{"F": "i."}],
"j.": [{"F": "j."}],
"k.": [{"F": "k."}],
"l.": [{"F": "l."}],
"m.": [{"F": "m."}],
"n.": [{"F": "n."}],
"o.": [{"F": "o."}],
"p.": [{"F": "p."}],
"q.": [{"F": "q."}],
"r.": [{"F": "r."}],
"s.": [{"F": "s."}],
"t.": [{"F": "t."}],
"u.": [{"F": "u."}],
"v.": [{"F": "v."}],
"w.": [{"F": "w."}],
"x.": [{"F": "x."}],
"y.": [{"F": "y."}],
"z.": [{"F": "z."}],
"i.e.": [{"F": "i.e."}],
"I.e.": [{"F": "I.e."}],
"I.E.": [{"F": "I.E."}],
"e.g.": [{"F": "e.g."}],
"E.g.": [{"F": "E.g."}],
"E.G.": [{"F": "E.G."}],
"\n": [{"F": "\n", "pos": "SP"}],
"\t": [{"F": "\t", "pos": "SP"}],
" ": [{"F": " ", "pos": "SP"}],
u"\u00a0": [{"F": u"\u00a0", "pos": "SP", "L": " "}]
}
def get_double_contractions(ending):
endings = []
ends_with_contraction = any([ending.endswith(contraction) for contraction in contractions])
while ends_with_contraction:
for contraction in contractions:
if ending.endswith(contraction):
endings.append(contraction)
ending = ending.rstrip(contraction)
ends_with_contraction = any([ending.endswith(contraction) for contraction in contractions])
endings.reverse() # reverse because the last ending is put in the list first
return endings
def get_token_properties(token, capitalize=False, remove_contractions=False):
props = dict(token_properties.get(token)) # ensure we copy the dict so we can add the "F" prop
if capitalize:
token = token.capitalize()
if remove_contractions:
token = token.replace("'", "")
props["F"] = token
return props
def create_entry(token, endings, capitalize=False, remove_contractions=False):
properties = []
properties.append(get_token_properties(token, capitalize=capitalize, remove_contractions=remove_contractions))
for e in endings:
properties.append(get_token_properties(e, remove_contractions=remove_contractions))
return properties
def generate_specials():
specials = {}
for token in starting_tokens:
possible_endings = starting_tokens[token]
for ending in possible_endings:
endings = []
if ending.count("'") > 1:
endings.extend(get_double_contractions(ending))
else:
endings.append(ending)
exceptions = possible_endings[ending]
if "lower" not in exceptions:
special = token + ending
specials[special] = create_entry(token, endings)
if "upper" not in exceptions:
special = token.capitalize() + ending
specials[special] = create_entry(token, endings, capitalize=True)
if "contrLower" not in exceptions:
special = token + ending.replace("'", "")
specials[special] = create_entry(token, endings, remove_contractions=True)
if "contrUpper" not in exceptions:
special = token.capitalize() + ending.replace("'", "")
specials[special] = create_entry(token, endings, capitalize=True, remove_contractions=True)
# add in hardcoded specials
specials = dict(specials, **hardcoded_specials)
return specials
if __name__ == "__main__":
specials = generate_specials()
with open("specials.json", "w") as file_:
file_.write(json.dumps(specials, indent=2))
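
The exception flags are easiest to see on the "he" + "'ll" case from the comment block above: without contrLower and contrUpper, the apostrophe-stripped variants would collide with the ordinary word hell. With the exceptions as listed in starting_tokens, the generator emits only the apostrophe forms:

# Which surface forms generate_specials() emits for token "he" with
# ending "'ll", given exceptions ["contrLower", "contrUpper"].
token, ending = "he", "'ll"
exceptions = ["contrLower", "contrUpper"]

generated = []
if "lower" not in exceptions:
    generated.append(token + ending)                                # "he'll"
if "upper" not in exceptions:
    generated.append(token.capitalize() + ending)                   # "He'll"
if "contrLower" not in exceptions:
    generated.append(token + ending.replace("'", ""))               # "hell" - skipped
if "contrUpper" not in exceptions:
    generated.append(token.capitalize() + ending.replace("'", ""))  # "Hell" - skipped

print(generated)  # ["he'll", "He'll"]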

View File

@ -1,6 +0,0 @@
\.\.\.+
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-z])
(?<=[a-zA-Z])--(?=[a-zA-z])
(?<=[0-9])-(?=[0-9])
(?<=[A-Za-z]),(?=[A-Za-z])

View File

@ -1,59 +0,0 @@
{
"PRP": {
"I": {"L": "-PRON-", "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Nom"},
"me": {"L": "-PRON-", "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc"},
"you": {"L": "-PRON-", "PronType": "Prs", "Person": "Two"},
"he": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Nom"},
"him": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Acc"},
"she": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Nom"},
"her": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Acc"},
"it": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"},
"we": {"L": "-PRON-", "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Nom"},
"us": {"L": "-PRON-", "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Acc"},
"they": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Nom"},
"them": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"},
"mine": {"L": "-PRON-", "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"},
"yours": {"L": "-PRON-", "PronType": "Prs", "Person": "Two", "Poss": "Yes", "Reflex": "Yes"},
"his": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"},
"hers": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"},
"its": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"},
"ours": {"L": "-PRON-", "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"},
"yours": {"L": "-PRON-", "PronType": "Prs", "Person": "Two", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"},
"theirs": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"},
"myself": {"L": "-PRON-", "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc", "Reflex": "Yes"},
"yourself": {"L": "-PRON-", "PronType": "Prs", "Person": "Two", "Case": "Acc", "Reflex": "Yes"},
"himself": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Masc", "Reflex": "Yes"},
"herself": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Fem", "Reflex": "Yes"},
"itself": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Neut", "Reflex": "Yes"},
"themself": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Reflex": "Yes"},
"ourselves": {"L": "-PRON-", "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Acc", "Reflex": "Yes"},
"yourselves": {"L": "-PRON-", "PronType": "Prs", "Person": "Two", "Case": "Acc", "Reflex": "Yes"},
"themselves": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc", "Reflex": "Yes"}
},
"PRP$": {
"my": {"L": "-PRON-", "Person": "One", "Number": "Sing", "PronType": "Prs", "Poss": "Yes"},
"your": {"L": "-PRON-", "Person": "Two", "PronType": "Prs", "Poss": "Yes"},
"his": {"L": "-PRON-", "Person": "Three", "Number": "Sing", "Gender": "Masc", "PronType": "Prs", "Poss": "Yes"},
"her": {"L": "-PRON-", "Person": "Three", "Number": "Sing", "Gender": "Fem", "PronType": "Prs", "Poss": "Yes"},
"its": {"L": "-PRON-", "Person": "Three", "Number": "Sing", "Gender": "Neut", "PronType": "Prs", "Poss": "Yes"},
"our": {"L": "-PRON-", "Person": "One", "Number": "Plur", "PronType": "Prs", "Poss": "Yes"},
"their": {"L": "-PRON-", "Person": "Three", "Number": "Plur", "PronType": "Prs", "Poss": "Yes"}
},
"VBZ": {
"am": {"L": "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"},
"are": {"L": "be", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", "Mood": "Ind"},
"is": {"L": "be", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind"},
},
"VBP": {
"are": {"L": "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}
},
"VBD": {
"was": {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"},
"were": {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"}
}
}
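
This table (presumably the morphs.json that setup_tokenizer copies alongside the other rule files) stores exceptional analyses keyed first by treebank tag and then by surface form, so closed-class words such as the personal pronouns and the forms of be get an explicit lemma (-PRON-, be) and feature bundle. A lookup mirrors the tag map:

# Sketch: fetch the stored analysis for "them" tagged PRP.
import json

with open("morphs.json", encoding="utf8") as file_:
    morph_rules = json.load(file_)

analysis = morph_rules["PRP"]["them"]
print(analysis["L"], analysis["Case"], analysis["Number"])  # -PRON- Acc Plur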

View File

@ -1,21 +0,0 @@
,
"
(
[
{
*
<
$
£
'
``
`
#
US$
C$
A$
a-
....
...

File diff suppressed because it is too large.

View File

@ -1,26 +0,0 @@
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
''
's
'S
s
S
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[0-9])km

View File

@ -1,60 +0,0 @@
{
".": {"pos": "punct", "puncttype": "peri"},
",": {"pos": "punct", "puncttype": "comm"},
"-LRB-": {"pos": "punct", "puncttype": "brck", "punctside": "ini"},
"-RRB-": {"pos": "punct", "puncttype": "brck", "punctside": "fin"},
"``": {"pos": "punct", "puncttype": "quot", "punctside": "ini"},
"\"\"": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
"''": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
":": {"pos": "punct"},
"$": {"pos": "sym", "other": {"symtype": "currency"}},
"#": {"pos": "sym", "other": {"symtype": "numbersign"}},
"AFX": {"pos": "adj", "hyph": "hyph"},
"CC": {"pos": "conj", "conjtype": "coor"},
"CD": {"pos": "num", "numtype": "card"},
"DT": {"pos": "det"},
"EX": {"pos": "adv", "advtype": "ex"},
"FW": {"pos": "x", "foreign": "foreign"},
"HYPH": {"pos": "punct", "puncttype": "dash"},
"IN": {"pos": "adp"},
"JJ": {"pos": "adj", "degree": "pos"},
"JJR": {"pos": "adj", "degree": "comp"},
"JJS": {"pos": "adj", "degree": "sup"},
"LS": {"pos": "punct", "numtype": "ord"},
"MD": {"pos": "verb", "verbtype": "mod"},
"NIL": {"pos": ""},
"NN": {"pos": "noun", "number": "sing"},
"NNP": {"pos": "propn", "nountype": "prop", "number": "sing"},
"NNPS": {"pos": "propn", "nountype": "prop", "number": "plur"},
"NNS": {"pos": "noun", "number": "plur"},
"PDT": {"pos": "adj", "adjtype": "pdt", "prontype": "prn"},
"POS": {"pos": "part", "poss": "poss"},
"PRP": {"pos": "pron", "prontype": "prs"},
"PRP$": {"pos": "adj", "prontype": "prs", "poss": "poss"},
"RB": {"pos": "adv", "degree": "pos"},
"RBR": {"pos": "adv", "degree": "comp"},
"RBS": {"pos": "adv", "degree": "sup"},
"RP": {"pos": "part"},
"SYM": {"pos": "sym"},
"TO": {"pos": "part", "parttype": "inf", "verbform": "inf"},
"UH": {"pos": "intJ"},
"VB": {"pos": "verb", "verbform": "inf"},
"VBD": {"pos": "verb", "verbform": "fin", "tense": "past"},
"VBG": {"pos": "verb", "verbform": "part", "tense": "pres", "aspect": "prog"},
"VBN": {"pos": "verb", "verbform": "part", "tense": "past", "aspect": "perf"},
"VBP": {"pos": "verb", "verbform": "fin", "tense": "pres"},
"VBZ": {"pos": "verb", "verbform": "fin", "tense": "pres", "number": "sing", "person": 3},
"WDT": {"pos": "adj", "prontype": "int|rel"},
"WP": {"pos": "noun", "prontype": "int|rel"},
"WP$": {"pos": "adj", "poss": "poss", "prontype": "int|rel"},
"WRB": {"pos": "adv", "prontype": "int|rel"},
"SP": {"pos": "space"},
"ADD": {"pos": "x"},
"NFP": {"pos": "punct"},
"GW": {"pos": "x"},
"AFX": {"pos": "x"},
"HYPH": {"pos": "punct"},
"XX": {"pos": "x"},
"BES": {"pos": "verb"},
"HVS": {"pos": "verb"}
}

View File

@ -1,3 +0,0 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-z])

View File

@ -1 +0,0 @@
{}

View File

@ -1,21 +0,0 @@
,
"
(
[
{
*
<
$
£
'
``
`
#
US$
C$
A$
a-
....
...

View File

@ -1,3 +0,0 @@
Biografie: Ein Spiel ist ein Theaterstück des Schweizer Schriftstellers Max Frisch, das 1967 entstand und am 1. Februar 1968 im Schauspielhaus Zürich uraufgeführt wurde. 1984 legte Frisch eine überarbeitete Neufassung vor. Das von Frisch als Komödie bezeichnete Stück greift eines seiner zentralen Themen auf: die Möglichkeit oder Unmöglichkeit des Menschen, seine Identität zu verändern.
Mit Biografie: Ein Spiel wandte sich Frisch von der Parabelform seiner Erfolgsstücke Biedermann und die Brandstifter und Andorra ab und postulierte eine „Dramaturgie der Permutation“. Darin sollte nicht, wie im klassischen Theater, Sinn und Schicksal im Mittelpunkt stehen, sondern die Zufälligkeit von Ereignissen und die Möglichkeit ihrer Variation. Dennoch handelt Biografie: Ein Spiel gerade von der Unmöglichkeit seines Protagonisten, seinen Lebenslauf grundlegend zu verändern. Frisch empfand die Wirkung des Stücks im Nachhinein als zu fatalistisch und die Umsetzung seiner theoretischen Absichten als nicht geglückt. Obwohl das Stück 1968 als unpolitisch und nicht zeitgemäß kritisiert wurde und auch später eine geteilte Rezeption erfuhr, gehört es an deutschsprachigen Bühnen zu den häufiger aufgeführten Stücken Frischs.

View File

@ -1,149 +0,0 @@
{
"a.m.": [{"F": "a.m."}],
"p.m.": [{"F": "p.m."}],
"1a.m.": [{"F": "1"}, {"F": "a.m."}],
"2a.m.": [{"F": "2"}, {"F": "a.m."}],
"3a.m.": [{"F": "3"}, {"F": "a.m."}],
"4a.m.": [{"F": "4"}, {"F": "a.m."}],
"5a.m.": [{"F": "5"}, {"F": "a.m."}],
"6a.m.": [{"F": "6"}, {"F": "a.m."}],
"7a.m.": [{"F": "7"}, {"F": "a.m."}],
"8a.m.": [{"F": "8"}, {"F": "a.m."}],
"9a.m.": [{"F": "9"}, {"F": "a.m."}],
"10a.m.": [{"F": "10"}, {"F": "a.m."}],
"11a.m.": [{"F": "11"}, {"F": "a.m."}],
"12a.m.": [{"F": "12"}, {"F": "a.m."}],
"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],
"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],
"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],
"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],
"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],
"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],
"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],
"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],
"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],
"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],
"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],
"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],
"1p.m.": [{"F": "1"}, {"F": "p.m."}],
"2p.m.": [{"F": "2"}, {"F": "p.m."}],
"3p.m.": [{"F": "3"}, {"F": "p.m."}],
"4p.m.": [{"F": "4"}, {"F": "p.m."}],
"5p.m.": [{"F": "5"}, {"F": "p.m."}],
"6p.m.": [{"F": "6"}, {"F": "p.m."}],
"7p.m.": [{"F": "7"}, {"F": "p.m."}],
"8p.m.": [{"F": "8"}, {"F": "p.m."}],
"9p.m.": [{"F": "9"}, {"F": "p.m."}],
"10p.m.": [{"F": "10"}, {"F": "p.m."}],
"11p.m.": [{"F": "11"}, {"F": "p.m."}],
"12p.m.": [{"F": "12"}, {"F": "p.m."}],
"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],
"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],
"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],
"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],
"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],
"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],
"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],
"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],
"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],
"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],
"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],
"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],
"Jan.": [{"F": "Jan.", "L": "Januar"}],
"Feb.": [{"F": "Feb.", "L": "Februar"}],
"Mär.": [{"F": "Mär.", "L": "März"}],
"Apr.": [{"F": "Apr.", "L": "April"}],
"Mai.": [{"F": "Mai.", "L": "Mai"}],
"Jun.": [{"F": "Jun.", "L": "Juni"}],
"Jul.": [{"F": "Jul.", "L": "Juli"}],
"Aug.": [{"F": "Aug.", "L": "August"}],
"Sep.": [{"F": "Sep.", "L": "September"}],
"Sept.": [{"F": "Sept.", "L": "September"}],
"Okt.": [{"F": "Okt.", "L": "Oktober"}],
"Nov.": [{"F": "Nov.", "L": "November"}],
"Dez.": [{"F": "Dez.", "L": "Dezember"}],
":)": [{"F": ":)"}],
"<3": [{"F": "<3"}],
";)": [{"F": ";)"}],
"(:": [{"F": "(:"}],
":(": [{"F": ":("}],
"-_-": [{"F": "-_-"}],
"=)": [{"F": "=)"}],
":/": [{"F": ":/"}],
":>": [{"F": ":>"}],
";-)": [{"F": ";-)"}],
":Y": [{"F": ":Y"}],
":P": [{"F": ":P"}],
":-P": [{"F": ":-P"}],
":3": [{"F": ":3"}],
"=3": [{"F": "=3"}],
"xD": [{"F": "xD"}],
"^_^": [{"F": "^_^"}],
"=]": [{"F": "=]"}],
"=D": [{"F": "=D"}],
"<333": [{"F": "<333"}],
":))": [{"F": ":))"}],
":0": [{"F": ":0"}],
"-__-": [{"F": "-__-"}],
"xDD": [{"F": "xDD"}],
"o_o": [{"F": "o_o"}],
"o_O": [{"F": "o_O"}],
"V_V": [{"F": "V_V"}],
"=[[": [{"F": "=[["}],
"<33": [{"F": "<33"}],
";p": [{"F": ";p"}],
";D": [{"F": ";D"}],
";-p": [{"F": ";-p"}],
";(": [{"F": ";("}],
":p": [{"F": ":p"}],
":]": [{"F": ":]"}],
":O": [{"F": ":O"}],
":-/": [{"F": ":-/"}],
":-)": [{"F": ":-)"}],
":(((": [{"F": ":((("}],
":((": [{"F": ":(("}],
":')": [{"F": ":')"}],
"(^_^)": [{"F": "(^_^)"}],
"(=": [{"F": "(="}],
"o.O": [{"F": "o.O"}],
"\")": [{"F": "\")"}],
"a.": [{"F": "a."}],
"b.": [{"F": "b."}],
"c.": [{"F": "c."}],
"d.": [{"F": "d."}],
"e.": [{"F": "e."}],
"f.": [{"F": "f."}],
"g.": [{"F": "g."}],
"h.": [{"F": "h."}],
"i.": [{"F": "i."}],
"j.": [{"F": "j."}],
"k.": [{"F": "k."}],
"l.": [{"F": "l."}],
"m.": [{"F": "m."}],
"n.": [{"F": "n."}],
"o.": [{"F": "o."}],
"p.": [{"F": "p."}],
"q.": [{"F": "q."}],
"s.": [{"F": "s."}],
"t.": [{"F": "t."}],
"u.": [{"F": "u."}],
"v.": [{"F": "v."}],
"w.": [{"F": "w."}],
"x.": [{"F": "x."}],
"y.": [{"F": "y."}],
"z.": [{"F": "z."}],
"z.b.": [{"F": "z.b."}],
"e.h.": [{"F": "I.e."}],
"o.ä.": [{"F": "I.E."}],
"bzw.": [{"F": "bzw."}],
"usw.": [{"F": "usw."}],
"\n": [{"F": "\n", "pos": "SP"}],
"\t": [{"F": "\t", "pos": "SP"}],
" ": [{"F": " ", "pos": "SP"}]
}

View File

@ -1,26 +0,0 @@
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
''
's
'S
s
S
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[0-9])km

View File

@ -1,19 +0,0 @@
{
"NOUN": {"pos": "NOUN"},
"VERB": {"pos": "VERB"},
"PUNCT": {"pos": "PUNCT"},
"ADV": {"pos": "ADV"},
"ADJ": {"pos": "ADJ"},
"PRON": {"pos": "PRON"},
"PROPN": {"pos": "PROPN"},
"CONJ": {"pos": "CONJ"},
"NUM": {"pos": "NUM"},
"AUX": {"pos": "AUX"},
"SCONJ": {"pos": "SCONJ"},
"ADP": {"pos": "ADP"},
"SYM": {"pos": "SYM"},
"X": {"pos": "X"},
"INTJ": {"pos": "INTJ"},
"DET": {"pos": "DET"},
"PART": {"pos": "PART"}
}

View File

@ -1,3 +0,0 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-z])

View File

@ -1,21 +0,0 @@
,
"
(
[
{
*
<
$
£
'
``
`
#
US$
C$
A$
a-
....
...

View File

@ -1,149 +0,0 @@
{
"a.m.": [{"F": "a.m."}],
"p.m.": [{"F": "p.m."}],
"1a.m.": [{"F": "1"}, {"F": "a.m."}],
"2a.m.": [{"F": "2"}, {"F": "a.m."}],
"3a.m.": [{"F": "3"}, {"F": "a.m."}],
"4a.m.": [{"F": "4"}, {"F": "a.m."}],
"5a.m.": [{"F": "5"}, {"F": "a.m."}],
"6a.m.": [{"F": "6"}, {"F": "a.m."}],
"7a.m.": [{"F": "7"}, {"F": "a.m."}],
"8a.m.": [{"F": "8"}, {"F": "a.m."}],
"9a.m.": [{"F": "9"}, {"F": "a.m."}],
"10a.m.": [{"F": "10"}, {"F": "a.m."}],
"11a.m.": [{"F": "11"}, {"F": "a.m."}],
"12a.m.": [{"F": "12"}, {"F": "a.m."}],
"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],
"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],
"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],
"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],
"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],
"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],
"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],
"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],
"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],
"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],
"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],
"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],
"1p.m.": [{"F": "1"}, {"F": "p.m."}],
"2p.m.": [{"F": "2"}, {"F": "p.m."}],
"3p.m.": [{"F": "3"}, {"F": "p.m."}],
"4p.m.": [{"F": "4"}, {"F": "p.m."}],
"5p.m.": [{"F": "5"}, {"F": "p.m."}],
"6p.m.": [{"F": "6"}, {"F": "p.m."}],
"7p.m.": [{"F": "7"}, {"F": "p.m."}],
"8p.m.": [{"F": "8"}, {"F": "p.m."}],
"9p.m.": [{"F": "9"}, {"F": "p.m."}],
"10p.m.": [{"F": "10"}, {"F": "p.m."}],
"11p.m.": [{"F": "11"}, {"F": "p.m."}],
"12p.m.": [{"F": "12"}, {"F": "p.m."}],
"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],
"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],
"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],
"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],
"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],
"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],
"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],
"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],
"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],
"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],
"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],
"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],
"Jan.": [{"F": "Jan.", "L": "Januar"}],
"Feb.": [{"F": "Feb.", "L": "Februar"}],
"Mär.": [{"F": "Mär.", "L": "März"}],
"Apr.": [{"F": "Apr.", "L": "April"}],
"Mai.": [{"F": "Mai.", "L": "Mai"}],
"Jun.": [{"F": "Jun.", "L": "Juni"}],
"Jul.": [{"F": "Jul.", "L": "Juli"}],
"Aug.": [{"F": "Aug.", "L": "August"}],
"Sep.": [{"F": "Sep.", "L": "September"}],
"Sept.": [{"F": "Sept.", "L": "September"}],
"Okt.": [{"F": "Okt.", "L": "Oktober"}],
"Nov.": [{"F": "Nov.", "L": "November"}],
"Dez.": [{"F": "Dez.", "L": "Dezember"}],
":)": [{"F": ":)"}],
"<3": [{"F": "<3"}],
";)": [{"F": ";)"}],
"(:": [{"F": "(:"}],
":(": [{"F": ":("}],
"-_-": [{"F": "-_-"}],
"=)": [{"F": "=)"}],
":/": [{"F": ":/"}],
":>": [{"F": ":>"}],
";-)": [{"F": ";-)"}],
":Y": [{"F": ":Y"}],
":P": [{"F": ":P"}],
":-P": [{"F": ":-P"}],
":3": [{"F": ":3"}],
"=3": [{"F": "=3"}],
"xD": [{"F": "xD"}],
"^_^": [{"F": "^_^"}],
"=]": [{"F": "=]"}],
"=D": [{"F": "=D"}],
"<333": [{"F": "<333"}],
":))": [{"F": ":))"}],
":0": [{"F": ":0"}],
"-__-": [{"F": "-__-"}],
"xDD": [{"F": "xDD"}],
"o_o": [{"F": "o_o"}],
"o_O": [{"F": "o_O"}],
"V_V": [{"F": "V_V"}],
"=[[": [{"F": "=[["}],
"<33": [{"F": "<33"}],
";p": [{"F": ";p"}],
";D": [{"F": ";D"}],
";-p": [{"F": ";-p"}],
";(": [{"F": ";("}],
":p": [{"F": ":p"}],
":]": [{"F": ":]"}],
":O": [{"F": ":O"}],
":-/": [{"F": ":-/"}],
":-)": [{"F": ":-)"}],
":(((": [{"F": ":((("}],
":((": [{"F": ":(("}],
":')": [{"F": ":')"}],
"(^_^)": [{"F": "(^_^)"}],
"(=": [{"F": "(="}],
"o.O": [{"F": "o.O"}],
"\")": [{"F": "\")"}],
"a.": [{"F": "a."}],
"b.": [{"F": "b."}],
"c.": [{"F": "c."}],
"d.": [{"F": "d."}],
"e.": [{"F": "e."}],
"f.": [{"F": "f."}],
"g.": [{"F": "g."}],
"h.": [{"F": "h."}],
"i.": [{"F": "i."}],
"j.": [{"F": "j."}],
"k.": [{"F": "k."}],
"l.": [{"F": "l."}],
"m.": [{"F": "m."}],
"n.": [{"F": "n."}],
"o.": [{"F": "o."}],
"p.": [{"F": "p."}],
"q.": [{"F": "q."}],
"s.": [{"F": "s."}],
"t.": [{"F": "t."}],
"u.": [{"F": "u."}],
"v.": [{"F": "v."}],
"w.": [{"F": "w."}],
"x.": [{"F": "x."}],
"y.": [{"F": "y."}],
"z.": [{"F": "z."}],
"z.b.": [{"F": "z.b."}],
"e.h.": [{"F": "I.e."}],
"o.ä.": [{"F": "I.E."}],
"bzw.": [{"F": "bzw."}],
"usw.": [{"F": "usw."}],
"\n": [{"F": "\n", "pos": "SP"}],
"\t": [{"F": "\t", "pos": "SP"}],
" ": [{"F": " ", "pos": "SP"}]
}

View File

@ -1,26 +0,0 @@
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
''
's
'S
s
S
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[0-9])km

View File

@ -1,44 +0,0 @@
{
"S": {"pos": "NOUN"},
"E": {"pos": "ADP"},
"RD": {"pos": "DET"},
"V": {"pos": "VERB"},
"_": {"pos": "NO_TAG"},
"A": {"pos": "ADJ"},
"SP": {"pos": "PROPN"},
"FF": {"pos": "PUNCT"},
"FS": {"pos": "PUNCT"},
"B": {"pos": "ADV"},
"CC": {"pos": "CONJ"},
"FB": {"pos": "PUNCT"},
"VA": {"pos": "AUX"},
"PC": {"pos": "PRON"},
"N": {"pos": "NUM"},
"RI": {"pos": "DET"},
"PR": {"pos": "PRON"},
"CS": {"pos": "SCONJ"},
"BN": {"pos": "ADV"},
"AP": {"pos": "DET"},
"VM": {"pos": "AUX"},
"DI": {"pos": "DET"},
"FC": {"pos": "PUNCT"},
"PI": {"pos": "PRON"},
"DD": {"pos": "DET"},
"DQ": {"pos": "DET"},
"PQ": {"pos": "PRON"},
"PD": {"pos": "PRON"},
"NO": {"pos": "ADJ"},
"PE": {"pos": "PRON"},
"T": {"pos": "DET"},
"X": {"pos": "SYM"},
"SW": {"pos": "X"},
"NO": {"pos": "PRON"},
"I": {"pos": "INTJ"},
"X": {"pos": "X"},
"DR": {"pos": "DET"},
"EA": {"pos": "ADP"},
"PP": {"pos": "PRON"},
"X": {"pos": "NUM"},
"DE": {"pos": "DET"},
"X": {"pos": "PART"}
}

View File

@ -1,194 +0,0 @@
{
"Reddit": [
"PRODUCT",
{},
[
[{"lower": "reddit"}]
]
],
"SeptemberElevenAttacks": [
"EVENT",
{},
[
[
{"orth": "9/11"}
],
[
{"lower": "september"},
{"orth": "11"}
]
]
],
"Linux": [
"PRODUCT",
{},
[
[{"lower": "linux"}]
]
],
"Haskell": [
"PRODUCT",
{},
[
[{"lower": "haskell"}]
]
],
"HaskellCurry": [
"PERSON",
{},
[
[
{"lower": "haskell"},
{"lower": "curry"}
]
]
],
"Javascript": [
"PRODUCT",
{},
[
[{"lower": "javascript"}]
]
],
"CSS": [
"PRODUCT",
{},
[
[{"lower": "css"}],
[{"lower": "css3"}]
]
],
"displaCy": [
"PRODUCT",
{},
[
[{"lower": "displacy"}]
]
],
"spaCy": [
"PRODUCT",
{},
[
[{"orth": "spaCy"}]
]
],
"HTML": [
"PRODUCT",
{},
[
[{"lower": "html"}],
[{"lower": "html5"}]
]
],
"Python": [
"PRODUCT",
{},
[
[{"orth": "Python"}]
]
],
"Ruby": [
"PRODUCT",
{},
[
[{"orth": "Ruby"}]
]
],
"Digg": [
"PRODUCT",
{},
[
[{"lower": "digg"}]
]
],
"FoxNews": [
"ORG",
{},
[
[{"orth": "Fox"}],
[{"orth": "News"}]
]
],
"Google": [
"ORG",
{},
[
[{"lower": "google"}]
]
],
"Mac": [
"PRODUCT",
{},
[
[{"lower": "mac"}]
]
],
"Wikipedia": [
"PRODUCT",
{},
[
[{"lower": "wikipedia"}]
]
],
"Windows": [
"PRODUCT",
{},
[
[{"orth": "Windows"}]
]
],
"Dell": [
"ORG",
{},
[
[{"lower": "dell"}]
]
],
"Facebook": [
"ORG",
{},
[
[{"lower": "facebook"}]
]
],
"Blizzard": [
"ORG",
{},
[
[{"orth": "Blizzard"}]
]
],
"Ubuntu": [
"ORG",
{},
[
[{"orth": "Ubuntu"}]
]
],
"Youtube": [
"PRODUCT",
{},
[
[{"lower": "youtube"}]
]
],
"false_positives": [
null,
{},
[
[{"orth": "Shit"}],
[{"orth": "Weed"}],
[{"orth": "Cool"}],
[{"orth": "Btw"}],
[{"orth": "Bah"}],
[{"orth": "Bullshit"}],
[{"orth": "Lol"}],
[{"orth": "Yo"}, {"lower": "dawg"}],
[{"orth": "Yay"}],
[{"orth": "Ahh"}],
[{"orth": "Yea"}],
[{"orth": "Bah"}]
]
]
}

View File

@ -1,6 +0,0 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-z])
(?<=[a-zA-Z])--(?=[a-zA-z])
(?<=[0-9])-(?=[0-9])
(?<=[A-Za-z]),(?=[A-Za-z])

View File

@ -1 +0,0 @@
{}

View File

@ -1,21 +0,0 @@
,
"
(
[
{
*
<
$
£
'
``
`
#
US$
C$
A$
a-
....
...

View File

@ -1 +0,0 @@
{}

View File

@ -1,26 +0,0 @@
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
''
's
'S
s
S
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[0-9])km

View File

@ -1,43 +0,0 @@
{
"NR": {"pos": "PROPN"},
"AD": {"pos": "ADV"},
"NN": {"pos": "NOUN"},
"CD": {"pos": "NUM"},
"DEG": {"pos": "PART"},
"PN": {"pos": "PRON"},
"M": {"pos": "PART"},
"JJ": {"pos": "ADJ"},
"DEC": {"pos": "PART"},
"NT": {"pos": "NOUN"},
"DT": {"pos": "DET"},
"LC": {"pos": "PART"},
"CC": {"pos": "CONJ"},
"AS": {"pos": "PART"},
"SP": {"pos": "PART"},
"IJ": {"pos": "INTJ"},
"OD": {"pos": "NUM"},
"MSP": {"pos": "PART"},
"CS": {"pos": "SCONJ"},
"ETC": {"pos": "PART"},
"DEV": {"pos": "PART"},
"BA": {"pos": "AUX"},
"SB": {"pos": "AUX"},
"DER": {"pos": "PART"},
"LB": {"pos": "AUX"},
"P": {"pos": "ADP"},
"URL": {"pos": "SYM"},
"FRAG": {"pos": "X"},
"X": {"pos": "X"},
"ON": {"pos": "X"},
"FW": {"pos": "X"},
"VC": {"pos": "VERB"},
"VV": {"pos": "VERB"},
"VA": {"pos": "VERB"},
"VE": {"pos": "VERB"},
"PU": {"pos": "PUNCT"},
"SP": {"pos": "SPACE"},
"NP": {"pos": "X"},
"_": {"pos": "X"},
"VP": {"pos": "X"},
"CHAR": {"pos": "X"}
}

View File

@ -87,5 +87,3 @@ cpdef enum attr_id_t:
    PROB
    LANG

View File

@ -120,8 +120,19 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
        stringy_attrs.pop('number')
    if 'tenspect' in stringy_attrs:
        stringy_attrs.pop('tenspect')
-   # for name, value in morphs.items():
-   #     stringy_attrs[name] = value
+   morph_keys = [
+       'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
+       'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
+       'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
+       'Number', 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
+       'Reflex', 'Negative', 'Mood', 'Aspect', 'Case']
+   for key in morph_keys:
+       if key in stringy_attrs:
+           stringy_attrs.pop(key)
+       elif key.lower() in stringy_attrs:
+           stringy_attrs.pop(key.lower())
+       elif key.upper() in stringy_attrs:
+           stringy_attrs.pop(key.upper())
    for name, value in stringy_attrs.items():
        if isinstance(name, int):
            int_key = name

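The hunk above replaces the commented-out morphology pass-through with an explicit strip: any key from the morphology list is popped from the attribute dict, whatever its casing, before the remaining attributes are interned. A standalone sketch of that behaviour, reproduced outside spaCy for illustration with an abbreviated key list:

# Standalone sketch of the morph-key stripping shown above (abbreviated key list).
MORPH_KEYS = ['PunctType', 'PunctSide', 'Degree', 'Number', 'VerbForm',
              'PronType', 'Tense', 'Poss', 'Person', 'Mood', 'Case']

def strip_morph_keys(stringy_attrs):
    """Drop morphological feature keys, whatever their casing."""
    for key in MORPH_KEYS:
        for variant in (key, key.lower(), key.upper()):
            if variant in stringy_attrs:
                stringy_attrs.pop(variant)
                break
    return stringy_attrs

print(strip_morph_keys({'pos': 'VERB', 'Tense': 'pres', 'VerbForm': 'fin'}))
# -> {'pos': 'VERB'}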
View File

@ -1,10 +1,12 @@
+# encoding: utf8
from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
from ..attrs import LANG
-from . import language_data
+from .language_data import *

class German(Language):
@ -15,13 +17,6 @@ class German(Language):
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'de'
-       prefixes = tuple(language_data.TOKENIZER_PREFIXES)
-       suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
-       infixes = tuple(language_data.TOKENIZER_INFIXES)
-       tag_map = dict(language_data.TAG_MAP)
-       stop_words = set(language_data.STOP_WORDS)
+       tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+       tag_map = TAG_MAP
+       stop_words = STOP_WORDS

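With the Defaults now pointing at the new language_data package, the German pipeline picks up its tag map, stop words and tokenizer exceptions at load time. A rough usage sketch, assuming a spaCy 1.x install with the German model data downloaded (the data itself is not part of this diff):

# Rough usage sketch (assumes spaCy 1.x with German model data installed).
from spacy.de import German

nlp = German()                        # Defaults supplies tag map, stop words, exceptions
doc = nlp(u"Das ist z.B. ein Test.")  # "z.B." survives as one token via TOKENIZER_EXCEPTIONS
print([t.text for t in doc])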
View File

@ -1,3 +0,0 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-z])

View File

@ -1 +0,0 @@
{}

View File

@ -1,21 +0,0 @@
,
"
(
[
{
*
<
$
£
'
``
`
#
US$
C$
A$
a-
....
...

View File

@ -1 +0,0 @@
{}

View File

@ -1,27 +0,0 @@
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
''
's
'S
s
S
\.\.
\.\.\.
\.\.\.\.
^\d+\.$
(?<=[a-z0-9)\]"'%\)])\.
(?<=[0-9])km

View File

@ -1 +0,0 @@
{}

View File

@ -1,31 +0,0 @@
{
"noun": [
["s", ""],
["ses", "s"],
["ves", "f"],
["xes", "x"],
["zes", "z"],
["ches", "ch"],
["shes", "sh"],
["men", "man"],
["ies", "y"]
],
"verb": [
["s", ""],
["ies", "y"],
["es", "e"],
["es", ""],
["ed", "e"],
["ed", ""],
["ing", "e"],
["ing", ""]
],
"adj": [
["er", ""],
["est", ""],
["er", "e"],
["est", "e"]
]
}

Binary file not shown.

View File

@ -1 +0,0 @@
-20.000000

File diff suppressed because it is too large

View File

@ -1,57 +0,0 @@
{
"$(": {"pos": "PUNCT", "PunctType": "Brck"},
"$,": {"pos": "PUNCT", "PunctType": "Comm"},
"$.": {"pos": "PUNCT", "PunctType": "Peri"},
"ADJA": {"pos": "ADJ"},
"ADJD": {"pos": "ADJ", "Variant": "Short"},
"ADV": {"pos": "ADV"},
"APPO": {"pos": "ADP", "AdpType": "Post"},
"APPR": {"pos": "ADP", "AdpType": "Prep"},
"APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"},
"APZR": {"pos": "ADP", "AdpType": "Circ"},
"ART": {"pos": "DET", "PronType": "Art"},
"CARD": {"pos": "NUM", "NumType": "Card"},
"FM": {"pos": "X", "Foreign": "Yes"},
"ITJ": {"pos": "INTJ"},
"KOKOM": {"pos": "CONJ", "ConjType": "Comp"},
"KON": {"pos": "CONJ"},
"KOUI": {"pos": "SCONJ"},
"KOUS": {"pos": "SCONJ"},
"NE": {"pos": "PROPN"},
"NN": {"pos": "NOUN"},
"PAV": {"pos": "ADV", "PronType": "Dem"},
"PDAT": {"pos": "DET", "PronType": "Dem"},
"PDS": {"pos": "PRON", "PronType": "Dem"},
"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"},
"PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"},
"PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"},
"PPER": {"pos": "PRON", "PronType": "Prs"},
"PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"},
"PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"},
"PRELAT": {"pos": "DET", "PronType": "Rel"},
"PRELS": {"pos": "PRON", "PronType": "Rel"},
"PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"},
"PTKA": {"pos": "PART"},
"PTKANT": {"pos": "PART", "PartType": "Res"},
"PTKNEG": {"pos": "PART", "Negative": "Neg"},
"PTKVZ": {"pos": "PART", "PartType": "Vbp"},
"PTKZU": {"pos": "PART", "PartType": "Inf"},
"PWAT": {"pos": "DET", "PronType": "Int"},
"PWAV": {"pos": "ADV", "PronType": "Int"},
"PWS": {"pos": "PRON", "PronType": "Int"},
"TRUNC": {"pos": "X", "Hyph": "Yes"},
"VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"},
"VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"},
"VAINF": {"pos": "AUX", "VerbForm": "Inf"},
"VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"},
"VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"},
"VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"},
"VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"},
"VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"},
"VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"},
"VVINF": {"pos": "VERB", "VerbForm": "Inf"},
"VVIZU": {"pos": "VERB", "VerbForm": "Inf"},
"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},
"XY": {"pos": "X"},
"SP": {"pos": "SPACE"}
}

File diff suppressed because it is too large

spacy/de/stop_words.py Normal file
View File

@ -0,0 +1,81 @@
# encoding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
á a ab aber ach acht achte achten achter achtes ag alle allein allem allen
aller allerdings alles allgemeinen als also am an andere anderen andern anders
auch auf aus ausser außer ausserdem außerdem
bald bei beide beiden beim beispiel bekannt bereits besonders besser besten bin
bis bisher bist
da dabei dadurch dafür dagegen daher dahin dahinter damals damit danach daneben
dank dann daran darauf daraus darf darfst darin darüber darum darunter das
dasein daselbst dass daß dasselbe davon davor dazu dazwischen dein deine deinem
deiner dem dementsprechend demgegenüber demgemäss demgemäß demselben demzufolge
den denen denn denselben der deren derjenige derjenigen dermassen dermaßen
derselbe derselben des deshalb desselben dessen deswegen dich die diejenige
diejenigen dies diese dieselbe dieselben diesem diesen dieser dieses dir doch
dort drei drin dritte dritten dritter drittes du durch durchaus dürfen dürft
durfte durften
eben ebenso ehrlich eigen eigene eigenen eigener eigenes ein einander eine
einem einen einer eines einige einigen einiger einiges einmal einmaleins elf en
ende endlich entweder er erst erste ersten erster erstes es etwa etwas euch
früher fünf fünfte fünften fünfter fünftes für
gab ganz ganze ganzen ganzer ganzes gar gedurft gegen gegenüber gehabt gehen
geht gekannt gekonnt gemacht gemocht gemusst genug gerade gern gesagt geschweige
gewesen gewollt geworden gibt ging gleich gott gross groß grosse große grossen
großen grosser großer grosses großes gut gute guter gutes
habe haben habt hast hat hatte hätte hatten hätten heisst heißt her heute hier
hin hinter hoch
ich ihm ihn ihnen ihr ihre ihrem ihrer ihres im immer in indem infolgedessen
ins irgend ist
ja jahr jahre jahren je jede jedem jeden jeder jedermann jedermanns jedoch
jemand jemandem jemanden jene jenem jenen jener jenes jetzt
kam kann kannst kaum kein keine keinem keinen keiner kleine kleinen kleiner
kleines kommen kommt können könnt konnte könnte konnten kurz
lang lange leicht leider lieber los
machen macht machte mag magst man manche manchem manchen mancher manches mehr
mein meine meinem meinen meiner meines mensch menschen mich mir mit mittel
mochte möchte mochten mögen möglich mögt morgen muss muß müssen musst müsst
musste mussten
na nach nachdem nahm natürlich neben nein neue neuen neun neunte neunten neunter
neuntes nicht nichts nie niemand niemandem niemanden noch nun nur
ob oben oder offen oft ohne
recht rechte rechten rechter rechtes richtig rund
sagt sagte sah satt schlecht schon sechs sechste sechsten sechster sechstes
sehr sei seid seien sein seine seinem seinen seiner seines seit seitdem selbst
selbst sich sie sieben siebente siebenten siebenter siebentes siebte siebten
siebter siebtes sind so solang solche solchem solchen solcher solches soll
sollen sollte sollten sondern sonst sowie später statt
tag tage tagen tat teil tel trotzdem tun
über überhaupt übrigens uhr um und uns unser unsere unserer unter
vergangene vergangenen viel viele vielem vielen vielleicht vier vierte vierten
vierter viertes vom von vor
wahr während währenddem währenddessen wann war wäre waren wart warum was wegen
weil weit weiter weitere weiteren weiteres welche welchem welchen welcher
welches wem wen wenig wenige weniger weniges wenigstens wenn wer werde werden
werdet wessen wie wieder will willst wir wird wirklich wirst wo wohl wollen
wollt wollte wollten worden wurde würde wurden würden
zehn zehnte zehnten zehnter zehntes zeit zu zuerst zugleich zum zunächst zur
zurück zusammen zwanzig zwar zwei zweite zweiten zweiter zweites zwischen
""".split())

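STOP_WORDS is a plain set of lower-cased forms, so membership tests are the whole API. A quick plain-Python sketch of filtering against it (the excerpt below is a small subset, not the full list):

# Plain-Python sketch: filtering tokens against a stop-word set like the one above.
STOP_WORDS = set("aber alle als am an auch auf aus bei bin bis".split())  # excerpt only

def content_words(tokens):
    return [t for t in tokens if t.lower() not in STOP_WORDS]

print(content_words("Das ist auch ein Test".split()))
# -> ['Das', 'ist', 'ein', 'Test']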
spacy/de/tag_map.py Normal file
View File

@ -0,0 +1,65 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
TAG_MAP = {
"$(": {POS: PUNCT, "PunctType": "brck"},
"$,": {POS: PUNCT, "PunctType": "comm"},
"$.": {POS: PUNCT, "PunctType": "peri"},
"ADJA": {POS: ADJ},
"ADJD": {POS: ADJ, "Variant": "short"},
"ADV": {POS: ADV},
"APPO": {POS: ADP, "AdpType": "post"},
"APPR": {POS: ADP, "AdpType": "prep"},
"APPRART": {POS: ADP, "AdpType": "prep", "PronType": "art"},
"APZR": {POS: ADP, "AdpType": "circ"},
"ART": {POS: DET, "PronType": "art"},
"CARD": {POS: NUM, "NumType": "card"},
"FM": {POS: X, "Foreign": "yes"},
"ITJ": {POS: INTJ},
"KOKOM": {POS: CONJ, "ConjType": "comp"},
"KON": {POS: CONJ},
"KOUI": {POS: SCONJ},
"KOUS": {POS: SCONJ},
"NE": {POS: PROPN},
"NNE": {POS: PROPN},
"NN": {POS: NOUN},
"PAV": {POS: ADV, "PronType": "dem"},
"PROAV": {POS: ADV, "PronType": "dem"},
"PDAT": {POS: DET, "PronType": "dem"},
"PDS": {POS: PRON, "PronType": "dem"},
"PIAT": {POS: DET, "PronType": "ind|neg|tot"},
"PIDAT": {POS: DET, "AdjType": "pdt", "PronType": "ind|neg|tot"},
"PIS": {POS: PRON, "PronType": "ind|neg|tot"},
"PPER": {POS: PRON, "PronType": "prs"},
"PPOSAT": {POS: DET, "Poss": "yes", "PronType": "prs"},
"PPOSS": {POS: PRON, "Poss": "yes", "PronType": "prs"},
"PRELAT": {POS: DET, "PronType": "rel"},
"PRELS": {POS: PRON, "PronType": "rel"},
"PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"},
"PTKA": {POS: PART},
"PTKANT": {POS: PART, "PartType": "res"},
"PTKNEG": {POS: PART, "Negative": "yes"},
"PTKVZ": {POS: PART, "PartType": "vbp"},
"PTKZU": {POS: PART, "PartType": "inf"},
"PWAT": {POS: DET, "PronType": "int"},
"PWAV": {POS: ADV, "PronType": "int"},
"PWS": {POS: PRON, "PronType": "int"},
"TRUNC": {POS: X, "Hyph": "yes"},
"VAFIN": {POS: AUX, "Mood": "ind", "VerbForm": "fin"},
"VAIMP": {POS: AUX, "Mood": "imp", "VerbForm": "fin"},
"VAINF": {POS: AUX, "VerbForm": "inf"},
"VAPP": {POS: AUX, "Aspect": "perf", "VerbForm": "part"},
"VMFIN": {POS: VERB, "Mood": "ind", "VerbForm": "fin", "VerbType": "mod"},
"VMINF": {POS: VERB, "VerbForm": "inf", "VerbType": "mod"},
"VMPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part", "VerbType": "mod"},
"VVFIN": {POS: VERB, "Mood": "ind", "VerbForm": "fin"},
"VVIMP": {POS: VERB, "Mood": "imp", "VerbForm": "fin"},
"VVINF": {POS: VERB, "VerbForm": "inf"},
"VVIZU": {POS: VERB, "VerbForm": "inf"},
"VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"},
"XY": {POS: X},
"SP": {POS: SPACE}
}

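Each entry maps a fine-grained STTS tag to the coarse universal POS plus its morphological features, so a consumer can split an entry into those two parts. A small lookup sketch over a TAG_MAP-shaped dict (hypothetical helper, not spaCy API; POS here is a plain string stand-in for the symbol used above):

# Hypothetical lookup over a TAG_MAP-shaped dict (illustration only).
POS = "pos"  # stand-in for the spacy.symbols constant used above

TAG_MAP = {
    "ADJD": {POS: "ADJ", "Variant": "short"},
    "VVFIN": {POS: "VERB", "Mood": "ind", "VerbForm": "fin"},
}

def describe(tag):
    features = dict(TAG_MAP[tag])
    return features.pop(POS), features

print(describe("VVFIN"))
# -> ('VERB', {'Mood': 'ind', 'VerbForm': 'fin'})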
View File

@ -0,0 +1,629 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA
TOKENIZER_EXCEPTIONS = {
"\\n": [
{ORTH: "\\n", LEMMA: "<nl>", TAG: "SP"}
],
"\\t": [
{ORTH: "\\t", LEMMA: "<tab>", TAG: "SP"}
],
"'S": [
{ORTH: "'S", LEMMA: PRON_LEMMA}
],
"'n": [
{ORTH: "'n", LEMMA: "ein"}
],
"'ne": [
{ORTH: "'ne", LEMMA: "eine"}
],
"'nen": [
{ORTH: "'nen", LEMMA: "einen"}
],
"'s": [
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"Abb.": [
{ORTH: "Abb.", LEMMA: "Abbildung"}
],
"Abk.": [
{ORTH: "Abk.", LEMMA: "Abkürzung"}
],
"Abt.": [
{ORTH: "Abt.", LEMMA: "Abteilung"}
],
"Apr.": [
{ORTH: "Apr.", LEMMA: "April"}
],
"Aug.": [
{ORTH: "Aug.", LEMMA: "August"}
],
"Bd.": [
{ORTH: "Bd.", LEMMA: "Band"}
],
"Betr.": [
{ORTH: "Betr.", LEMMA: "Betreff"}
],
"Bf.": [
{ORTH: "Bf.", LEMMA: "Bahnhof"}
],
"Bhf.": [
{ORTH: "Bhf.", LEMMA: "Bahnhof"}
],
"Bsp.": [
{ORTH: "Bsp.", LEMMA: "Beispiel"}
],
"Dez.": [
{ORTH: "Dez.", LEMMA: "Dezember"}
],
"Di.": [
{ORTH: "Di.", LEMMA: "Dienstag"}
],
"Do.": [
{ORTH: "Do.", LEMMA: "Donnerstag"}
],
"Fa.": [
{ORTH: "Fa.", LEMMA: "Firma"}
],
"Fam.": [
{ORTH: "Fam.", LEMMA: "Familie"}
],
"Feb.": [
{ORTH: "Feb.", LEMMA: "Februar"}
],
"Fr.": [
{ORTH: "Fr.", LEMMA: "Frau"}
],
"Frl.": [
{ORTH: "Frl.", LEMMA: "Fräulein"}
],
"Hbf.": [
{ORTH: "Hbf.", LEMMA: "Hauptbahnhof"}
],
"Hr.": [
{ORTH: "Hr.", LEMMA: "Herr"}
],
"Hrn.": [
{ORTH: "Hrn.", LEMMA: "Herr"}
],
"Jan.": [
{ORTH: "Jan.", LEMMA: "Januar"}
],
"Jh.": [
{ORTH: "Jh.", LEMMA: "Jahrhundert"}
],
"Jhd.": [
{ORTH: "Jhd.", LEMMA: "Jahrhundert"}
],
"Jul.": [
{ORTH: "Jul.", LEMMA: "Juli"}
],
"Jun.": [
{ORTH: "Jun.", LEMMA: "Juni"}
],
"Mi.": [
{ORTH: "Mi.", LEMMA: "Mittwoch"}
],
"Mio.": [
{ORTH: "Mio.", LEMMA: "Million"}
],
"Mo.": [
{ORTH: "Mo.", LEMMA: "Montag"}
],
"Mrd.": [
{ORTH: "Mrd.", LEMMA: "Milliarde"}
],
"Mrz.": [
{ORTH: "Mrz.", LEMMA: "März"}
],
"MwSt.": [
{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer"}
],
"Mär.": [
{ORTH: "Mär.", LEMMA: "März"}
],
"Nov.": [
{ORTH: "Nov.", LEMMA: "November"}
],
"Nr.": [
{ORTH: "Nr.", LEMMA: "Nummer"}
],
"Okt.": [
{ORTH: "Okt.", LEMMA: "Oktober"}
],
"Orig.": [
{ORTH: "Orig.", LEMMA: "Original"}
],
"Pkt.": [
{ORTH: "Pkt.", LEMMA: "Punkt"}
],
"Prof.": [
{ORTH: "Prof.", LEMMA: "Professor"}
],
"Red.": [
{ORTH: "Red.", LEMMA: "Redaktion"}
],
"S'": [
{ORTH: "S'", LEMMA: PRON_LEMMA}
],
"Sa.": [
{ORTH: "Sa.", LEMMA: "Samstag"}
],
"Sep.": [
{ORTH: "Sep.", LEMMA: "September"}
],
"Sept.": [
{ORTH: "Sept.", LEMMA: "September"}
],
"So.": [
{ORTH: "So.", LEMMA: "Sonntag"}
],
"Std.": [
{ORTH: "Std.", LEMMA: "Stunde"}
],
"Str.": [
{ORTH: "Str.", LEMMA: "Straße"}
],
"Tel.": [
{ORTH: "Tel.", LEMMA: "Telefon"}
],
"Tsd.": [
{ORTH: "Tsd.", LEMMA: "Tausend"}
],
"Univ.": [
{ORTH: "Univ.", LEMMA: "Universität"}
],
"abzgl.": [
{ORTH: "abzgl.", LEMMA: "abzüglich"}
],
"allg.": [
{ORTH: "allg.", LEMMA: "allgemein"}
],
"auf'm": [
{ORTH: "auf", LEMMA: "auf"},
{ORTH: "'m", LEMMA: PRON_LEMMA}
],
"bspw.": [
{ORTH: "bspw.", LEMMA: "beispielsweise"}
],
"bzgl.": [
{ORTH: "bzgl.", LEMMA: "bezüglich"}
],
"bzw.": [
{ORTH: "bzw.", LEMMA: "beziehungsweise"}
],
"d.h.": [
{ORTH: "d.h.", LEMMA: "das heißt"}
],
"dgl.": [
{ORTH: "dgl.", LEMMA: "dergleichen"}
],
"du's": [
{ORTH: "du", LEMMA: PRON_LEMMA},
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"ebd.": [
{ORTH: "ebd.", LEMMA: "ebenda"}
],
"eigtl.": [
{ORTH: "eigtl.", LEMMA: "eigentlich"}
],
"engl.": [
{ORTH: "engl.", LEMMA: "englisch"}
],
"er's": [
{ORTH: "er", LEMMA: PRON_LEMMA},
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"evtl.": [
{ORTH: "evtl.", LEMMA: "eventuell"}
],
"frz.": [
{ORTH: "frz.", LEMMA: "französisch"}
],
"gegr.": [
{ORTH: "gegr.", LEMMA: "gegründet"}
],
"ggf.": [
{ORTH: "ggf.", LEMMA: "gegebenenfalls"}
],
"ggfs.": [
{ORTH: "ggfs.", LEMMA: "gegebenenfalls"}
],
"ggü.": [
{ORTH: "ggü.", LEMMA: "gegenüber"}
],
"hinter'm": [
{ORTH: "hinter", LEMMA: "hinter"},
{ORTH: "'m", LEMMA: PRON_LEMMA}
],
"i.O.": [
{ORTH: "i.O.", LEMMA: "in Ordnung"}
],
"i.d.R.": [
{ORTH: "i.d.R.", LEMMA: "in der Regel"}
],
"ich's": [
{ORTH: "ich", LEMMA: PRON_LEMMA},
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"ihr's": [
{ORTH: "ihr", LEMMA: PRON_LEMMA},
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"incl.": [
{ORTH: "incl.", LEMMA: "inklusive"}
],
"inkl.": [
{ORTH: "inkl.", LEMMA: "inklusive"}
],
"insb.": [
{ORTH: "insb.", LEMMA: "insbesondere"}
],
"kath.": [
{ORTH: "kath.", LEMMA: "katholisch"}
],
"lt.": [
{ORTH: "lt.", LEMMA: "laut"}
],
"max.": [
{ORTH: "max.", LEMMA: "maximal"}
],
"min.": [
{ORTH: "min.", LEMMA: "minimal"}
],
"mind.": [
{ORTH: "mind.", LEMMA: "mindestens"}
],
"mtl.": [
{ORTH: "mtl.", LEMMA: "monatlich"}
],
"n.Chr.": [
{ORTH: "n.Chr.", LEMMA: "nach Christus"}
],
"orig.": [
{ORTH: "orig.", LEMMA: "original"}
],
"röm.": [
{ORTH: "röm.", LEMMA: "römisch"}
],
"s'": [
{ORTH: "s'", LEMMA: PRON_LEMMA}
],
"s.o.": [
{ORTH: "s.o.", LEMMA: "siehe oben"}
],
"sie's": [
{ORTH: "sie", LEMMA: PRON_LEMMA},
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"sog.": [
{ORTH: "sog.", LEMMA: "so genannt"}
],
"stellv.": [
{ORTH: "stellv.", LEMMA: "stellvertretend"}
],
"tägl.": [
{ORTH: "tägl.", LEMMA: "täglich"}
],
"u.U.": [
{ORTH: "u.U.", LEMMA: "unter Umständen"}
],
"u.s.w.": [
{ORTH: "u.s.w.", LEMMA: "und so weiter"}
],
"u.v.m.": [
{ORTH: "u.v.m.", LEMMA: "und vieles mehr"}
],
"unter'm": [
{ORTH: "unter", LEMMA: "unter"},
{ORTH: "'m", LEMMA: PRON_LEMMA}
],
"usf.": [
{ORTH: "usf.", LEMMA: "und so fort"}
],
"usw.": [
{ORTH: "usw.", LEMMA: "und so weiter"}
],
"uvm.": [
{ORTH: "uvm.", LEMMA: "und vieles mehr"}
],
"v.Chr.": [
{ORTH: "v.Chr.", LEMMA: "vor Christus"}
],
"v.a.": [
{ORTH: "v.a.", LEMMA: "vor allem"}
],
"v.l.n.r.": [
{ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"}
],
"vgl.": [
{ORTH: "vgl.", LEMMA: "vergleiche"}
],
"vllt.": [
{ORTH: "vllt.", LEMMA: "vielleicht"}
],
"vlt.": [
{ORTH: "vlt.", LEMMA: "vielleicht"}
],
"vor'm": [
{ORTH: "vor", LEMMA: "vor"},
{ORTH: "'m", LEMMA: PRON_LEMMA}
],
"wir's": [
{ORTH: "wir", LEMMA: PRON_LEMMA},
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"z.B.": [
{ORTH: "z.B.", LEMMA: "zum Beispiel"}
],
"z.Bsp.": [
{ORTH: "z.Bsp.", LEMMA: "zum Beispiel"}
],
"z.T.": [
{ORTH: "z.T.", LEMMA: "zum Teil"}
],
"z.Z.": [
{ORTH: "z.Z.", LEMMA: "zur Zeit"}
],
"z.Zt.": [
{ORTH: "z.Zt.", LEMMA: "zur Zeit"}
],
"z.b.": [
{ORTH: "z.b.", LEMMA: "zum Beispiel"}
],
"zzgl.": [
{ORTH: "zzgl.", LEMMA: "zuzüglich"}
],
"österr.": [
{ORTH: "österr.", LEMMA: "österreichisch"}
],
"über'm": [
{ORTH: "über", LEMMA: "über"},
{ORTH: "'m", LEMMA: PRON_LEMMA}
]
}
ORTH_ONLY = [
"'",
"\\\")",
"<space>",
"a.",
"ä.",
"A.C.",
"a.D.",
"A.D.",
"A.G.",
"a.M.",
"a.Z.",
"Abs.",
"adv.",
"al.",
"b.",
"B.A.",
"B.Sc.",
"betr.",
"biol.",
"Biol.",
"c.",
"ca.",
"Chr.",
"Cie.",
"co.",
"Co.",
"d.",
"D.C.",
"Dipl.-Ing.",
"Dipl.",
"Dr.",
"e.",
"e.g.",
"e.V.",
"ehem.",
"entspr.",
"erm.",
"etc.",
"ev.",
"f.",
"g.",
"G.m.b.H.",
"geb.",
"Gebr.",
"gem.",
"h.",
"h.c.",
"Hg.",
"hrsg.",
"Hrsg.",
"i.",
"i.A.",
"i.e.",
"i.G.",
"i.Tr.",
"i.V.",
"Ing.",
"j.",
"jr.",
"Jr.",
"jun.",
"jur.",
"k.",
"K.O.",
"l.",
"L.A.",
"lat.",
"m.",
"M.A.",
"m.E.",
"m.M.",
"M.Sc.",
"Mr.",
"n.",
"N.Y.",
"N.Y.C.",
"nat.",
"ö."
"o.",
"o.a.",
"o.ä.",
"o.g.",
"o.k.",
"O.K.",
"p.",
"p.a.",
"p.s.",
"P.S.",
"pers.",
"phil.",
"q.",
"q.e.d.",
"r.",
"R.I.P.",
"rer.",
"s.",
"sen.",
"St.",
"std.",
"t.",
"u.",
"ü.",
"u.a.",
"U.S.",
"U.S.A.",
"U.S.S.",
"v.",
"Vol.",
"vs.",
"w.",
"wiss.",
"x.",
"y.",
"z.",
]

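The full exceptions above carry lemmas, while ORTH_ONLY entries only need to survive tokenization in one piece. A sketch of how the two could be merged into a single lookup table; the helper names mirror strings_to_exc and update_exc from this PR's spacy.language_data package, but the bodies here are assumptions, and "F"/"L" stand in for the real attribute symbols:

# Sketch of folding ORTH_ONLY strings into the exceptions dict.
# The real helpers live in spacy.language_data; these bodies are assumptions.
ORTH, LEMMA = "F", "L"   # stand-ins for the attribute symbols used above

def strings_to_exc(strings):
    """One single-token exception per string, keeping only the surface form."""
    return {s: [{ORTH: s}] for s in strings}

def update_exc(exceptions, additions):
    exceptions.update(additions)

TOKENIZER_EXCEPTIONS = {"z.B.": [{ORTH: "z.B.", LEMMA: "zum Beispiel"}]}
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(["Dr.", "ca.", "etc."]))
print(sorted(TOKENIZER_EXCEPTIONS))
# -> ['Dr.', 'ca.', 'etc.', 'z.B.']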
View File

@ -1,15 +1,16 @@
+# encoding: utf8
from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
-from . import language_data
from .. import util
from ..lemmatizer import Lemmatizer
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..attrs import LANG
+from .language_data import *

class English(Language):
    lang = 'en'
@ -18,14 +19,7 @@ class English(Language):
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'en'
-       tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
-       prefixes = tuple(language_data.TOKENIZER_PREFIXES)
-       suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
-       infixes = tuple(language_data.TOKENIZER_INFIXES)
-       tag_map = dict(language_data.TAG_MAP)
-       stop_words = set(language_data.STOP_WORDS)
+       tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+       tag_map = TAG_MAP
+       stop_words = STOP_WORDS
+       lemma_rules = LEMMA_RULES

File diff suppressed because it is too large

View File

@ -1,4 +1,8 @@
-{
+# encoding: utf8
+from __future__ import unicode_literals
+
+LEMMA_RULES = {
    "noun": [
        ["s", ""],
        ["ses", "s"],

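Each lemma rule is a [suffix, replacement] pair: if a word ends in the suffix, strip it and append the replacement. A minimal sketch of applying such rules (hypothetical helper; spaCy's real Lemmatizer additionally checks the candidates against its word lists, which this sketch skips):

# Hypothetical application of [suffix, replacement] lemma rules (illustration only).
NOUN_RULES = [["s", ""], ["ses", "s"], ["ies", "y"], ["ches", "ch"]]

def candidate_lemmas(word, rules):
    forms = set()
    for suffix, repl in rules:
        if word.endswith(suffix):
            forms.add(word[:len(word) - len(suffix)] + repl)
    return forms

print(sorted(candidate_lemmas("churches", NOUN_RULES)))
# -> ['church', 'churche']  (a word-list check would keep only 'church')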
spacy/en/morph_rules.py Normal file
View File

@ -0,0 +1,67 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA
MORPH_RULES = {
"PRP": {
"I": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Nom"},
"me": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc"},
"you": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two"},
"he": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Nom"},
"him": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Acc"},
"she": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Nom"},
"her": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Acc"},
"it": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"},
"we": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Nom"},
"us": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Acc"},
"they": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Nom"},
"them": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"},
"mine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"},
"yours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Poss": "Yes", "Reflex": "Yes"},
"his": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"},
"hers": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"},
"its": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"},
"ours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"},
"yours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"},
"theirs": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"},
"myself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc", "Reflex": "Yes"},
"yourself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Case": "Acc", "Reflex": "Yes"},
"himself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Masc", "Reflex": "Yes"},
"herself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Fem", "Reflex": "Yes"},
"itself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Neut", "Reflex": "Yes"},
"themself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Reflex": "Yes"},
"ourselves": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Acc", "Reflex": "Yes"},
"yourselves": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Case": "Acc", "Reflex": "Yes"},
"themselves": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc", "Reflex": "Yes"}
},
"PRP$": {
"my": {LEMMA: PRON_LEMMA, "Person": "One", "Number": "Sing", "PronType": "Prs", "Poss": "Yes"},
"your": {LEMMA: PRON_LEMMA, "Person": "Two", "PronType": "Prs", "Poss": "Yes"},
"his": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Sing", "Gender": "Masc", "PronType": "Prs", "Poss": "Yes"},
"her": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Sing", "Gender": "Fem", "PronType": "Prs", "Poss": "Yes"},
"its": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Sing", "Gender": "Neut", "PronType": "Prs", "Poss": "Yes"},
"our": {LEMMA: PRON_LEMMA, "Person": "One", "Number": "Plur", "PronType": "Prs", "Poss": "Yes"},
"their": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Plur", "PronType": "Prs", "Poss": "Yes"}
},
"VBZ": {
"am": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"},
"are": {LEMMA: "be", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", "Mood": "Ind"},
"is": {LEMMA: "be", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind"},
},
"VBP": {
"are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}
},
"VBD": {
"was": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"},
"were": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"}
}
}

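MORPH_RULES is keyed first by the fine-grained tag and then by the token text, with each value supplying the lemma and the morphological features for that exact pairing. A quick lookup sketch over a MORPH_RULES-shaped dict (hypothetical helper; "L" and "-PRON-" stand in for the LEMMA symbol and the PRON_LEMMA constant):

# Hypothetical lookup over a MORPH_RULES-shaped dict (illustration only).
LEMMA, PRON_LEMMA = "L", "-PRON-"   # stand-ins for the real symbols

MORPH_RULES = {
    "PRP": {
        "me": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One",
               "Number": "Sing", "Case": "Acc"},
    },
}

def morphology(tag, text):
    return MORPH_RULES.get(tag, {}).get(text, {})

print(morphology("PRP", "me").get(LEMMA))
# -> -PRON-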
View File

@ -1,47 +0,0 @@
import re
_mw_prepositions = [
'close to',
'down by',
'on the way to',
'on my way to',
'on my way',
'on his way to',
'on his way',
'on her way to',
'on her way',
'on your way to',
'on your way',
'on our way to',
'on our way',
'on their way to',
'on their way',
'along the route from'
]
MW_PREPOSITIONS_RE = re.compile('|'.join(_mw_prepositions), flags=re.IGNORECASE)
TIME_RE = re.compile(
'{colon_digits}|{colon_digits} ?{am_pm}?|{one_two_digits} ?({am_pm})'.format(
colon_digits=r'[0-2]?[0-9]:[0-5][0-9](?::[0-5][0-9])?',
one_two_digits=r'[0-2]?[0-9]',
am_pm=r'[ap]\.?m\.?'))
DATE_RE = re.compile(
'(?:this|last|next|the) (?:week|weekend|{days})'.format(
days='Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday'
))
MONEY_RE = re.compile('\$\d+(?:\.\d+)?|\d+ dollars(?: \d+ cents)?')
DAYS_RE = re.compile('Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday')
REGEXES = [('IN', 'O', MW_PREPOSITIONS_RE), ('CD', 'TIME', TIME_RE),
('NNP', 'DATE', DATE_RE),
('NNP', 'DATE', DAYS_RE), ('CD', 'MONEY', MONEY_RE)]

spacy/en/stop_words.py Normal file
View File

@ -0,0 +1,67 @@
# encoding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
a about above across after afterwards again against all almost alone along
already also although always am among amongst amount an and another any anyhow
anyone anything anyway anywhere are around as at
back be became because become becomes becoming been before beforehand behind
being below beside besides between beyond both bottom but by
call can cannot ca could
did do does doing done down due during
each eight either eleven else elsewhere empty enough etc even ever every
everyone everything everywhere except
few fifteen fifty first five for former formerly forty four from front full
further
get give go
had has have he hence her here hereafter hereby herein hereupon hers herself
him himself his how however hundred
i if in inc indeed into is it its itself
keep
last latter latterly least less
just
made make many may me meanwhile might mine more moreover most mostly move much
must my myself
name namely neither never nevertheless next nine no nobody none noone nor not
nothing now nowhere
of off often on once one only onto or other others otherwise our ours ourselves
out over own
part per perhaps please put
quite
rather re really regarding
same say see seem seemed seeming seems serious several she should show side
since six sixty so some somehow someone something sometime sometimes somewhere
still such
take ten than that the their them themselves then thence there thereafter
thereby therefore therein thereupon these they third this those though three
through throughout thru thus to together too top toward towards twelve twenty
two
under until up unless upon us used using
various very very via was we well were what whatever when whence whenever where
whereafter whereas whereby wherein whereupon wherever whether which while
whither who whoever whole whom whose why will with within without would
yet you your yours yourself yourselves
""".split())

spacy/en/tag_map.py Normal file
View File

@ -0,0 +1,64 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
TAG_MAP = {
".": {POS: PUNCT, "PunctType": "peri"},
",": {POS: PUNCT, "PunctType": "comm"},
"-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
"-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
"``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
"\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
"''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
":": {POS: PUNCT},
"$": {POS: SYM, "Other": {"SymType": "currency"}},
"#": {POS: SYM, "Other": {"SymType": "numbersign"}},
"AFX": {POS: ADJ, "Hyph": "yes"},
"CC": {POS: CONJ, "ConjType": "coor"},
"CD": {POS: NUM, "NumType": "card"},
"DT": {POS: DET},
"EX": {POS: ADV, "AdvType": "ex"},
"FW": {POS: X, "Foreign": "yes"},
"HYPH": {POS: PUNCT, "PunctType": "dash"},
"IN": {POS: ADP},
"JJ": {POS: ADJ, "Degree": "pos"},
"JJR": {POS: ADJ, "Degree": "comp"},
"JJS": {POS: ADJ, "Degree": "sup"},
"LS": {POS: PUNCT, "NumType": "ord"},
"MD": {POS: VERB, "VerbType": "mod"},
"NIL": {POS: ""},
"NN": {POS: NOUN, "Number": "sing"},
"NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
"NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
"NNS": {POS: NOUN, "Number": "plur"},
"PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
"POS": {POS: PART, "Poss": "yes"},
"PRP": {POS: PRON, "PronType": "prs"},
"PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
"RB": {POS: ADV, "Degree": "pos"},
"RBR": {POS: ADV, "Degree": "comp"},
"RBS": {POS: ADV, "Degree": "sup"},
"RP": {POS: PART},
"SYM": {POS: SYM},
"TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
"UH": {POS: INTJ},
"VB": {POS: VERB, "VerbForm": "inf"},
"VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
"VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
"VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
"VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
"VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3},
"WDT": {POS: ADJ, "PronType": "int|rel"},
"WP": {POS: NOUN, "PronType": "int|rel"},
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
"WRB": {POS: ADV, "PronType": "int|rel"},
"SP": {POS: SPACE},
"ADD": {POS: X},
"NFP": {POS: PUNCT},
"GW": {POS: X},
"XX": {POS: X},
"BES": {POS: VERB},
"HVS": {POS: VERB}
}

File diff suppressed because it is too large

View File

@ -1,246 +0,0 @@
import os
import time
import io
import math
import re
try:
from urllib.parse import urlparse
from urllib.request import urlopen, Request
from urllib.error import HTTPError
except ImportError:
from urllib2 import urlopen, urlparse, Request, HTTPError
class UnknownContentLengthException(Exception): pass
class InvalidChecksumException(Exception): pass
class UnsupportedHTTPCodeException(Exception): pass
class InvalidOffsetException(Exception): pass
class MissingChecksumHeader(Exception): pass
CHUNK_SIZE = 16 * 1024
class RateSampler(object):
def __init__(self, period=1):
self.rate = None
self.reset = True
self.period = period
def __enter__(self):
if self.reset:
self.reset = False
self.start = time.time()
self.counter = 0
def __exit__(self, type, value, traceback):
elapsed = time.time() - self.start
if elapsed >= self.period:
self.reset = True
self.rate = float(self.counter) / elapsed
def update(self, value):
self.counter += value
def format(self, unit="MB"):
if self.rate is None:
return None
divisor = {'MB': 1048576, 'kB': 1024}
return "%0.2f%s/s" % (self.rate / divisor[unit], unit)
class TimeEstimator(object):
def __init__(self, cooldown=1):
self.cooldown = cooldown
self.start = time.time()
self.time_left = None
def update(self, bytes_read, total_size):
elapsed = time.time() - self.start
if elapsed > self.cooldown:
self.time_left = math.ceil(elapsed * total_size /
bytes_read - elapsed)
def format(self):
if self.time_left is None:
return None
res = "eta "
if self.time_left / 60 >= 1:
res += "%dm " % (self.time_left / 60)
return res + "%ds" % (self.time_left % 60)
def format_bytes_read(bytes_read, unit="MB"):
divisor = {'MB': 1048576, 'kB': 1024}
return "%0.2f%s" % (float(bytes_read) / divisor[unit], unit)
def format_percent(bytes_read, total_size):
percent = round(bytes_read * 100.0 / total_size, 2)
return "%0.2f%%" % percent
def get_content_range(response):
content_range = response.headers.get('Content-Range', "").strip()
if content_range:
m = re.match(r"bytes (\d+)-(\d+)/(\d+)", content_range)
if m:
return [int(v) for v in m.groups()]
def get_content_length(response):
if 'Content-Length' not in response.headers:
raise UnknownContentLengthException
return int(response.headers.get('Content-Length').strip())
def get_url_meta(url, checksum_header=None):
class HeadRequest(Request):
def get_method(self):
return "HEAD"
r = urlopen(HeadRequest(url))
res = {'size': get_content_length(r)}
if checksum_header:
value = r.headers.get(checksum_header)
if value:
res['checksum'] = value
r.close()
return res
def progress(console, bytes_read, total_size, transfer_rate, eta):
fields = [
format_bytes_read(bytes_read),
format_percent(bytes_read, total_size),
transfer_rate.format(),
eta.format(),
" " * 10,
]
console.write("Downloaded %s\r" % " ".join(filter(None, fields)))
console.flush()
def read_request(request, offset=0, console=None,
progress_func=None, write_func=None):
# support partial downloads
if offset > 0:
request.add_header('Range', "bytes=%s-" % offset)
try:
response = urlopen(request)
except HTTPError as e:
if e.code == 416: # Requested Range Not Satisfiable
raise InvalidOffsetException
# TODO add http error handling here
raise UnsupportedHTTPCodeException(e.code)
total_size = get_content_length(response) + offset
bytes_read = offset
# sanity checks
if response.code == 200: # OK
assert offset == 0
elif response.code == 206: # Partial content
range_start, range_end, range_total = get_content_range(response)
assert range_start == offset
assert range_total == total_size
assert range_end + 1 - range_start == total_size - bytes_read
else:
raise UnsupportedHTTPCodeException(response.code)
eta = TimeEstimator()
transfer_rate = RateSampler()
if console:
if offset > 0:
console.write("Continue downloading...\n")
else:
console.write("Downloading...\n")
while True:
with transfer_rate:
chunk = response.read(CHUNK_SIZE)
if not chunk:
if progress_func and console:
console.write('\n')
break
bytes_read += len(chunk)
transfer_rate.update(len(chunk))
eta.update(bytes_read - offset, total_size - offset)
if progress_func and console:
progress_func(console, bytes_read, total_size, transfer_rate, eta)
if write_func:
write_func(chunk)
response.close()
assert bytes_read == total_size
return response
def download(url, path=".",
checksum=None, checksum_header=None,
headers=None, console=None):
if os.path.isdir(path):
path = os.path.join(path, url.rsplit('/', 1)[1])
path = os.path.abspath(path)
with io.open(path, "a+b") as f:
size = f.tell()
# update checksum of partially downloaded file
if checksum:
f.seek(0, os.SEEK_SET)
for chunk in iter(lambda: f.read(CHUNK_SIZE), b""):
checksum.update(chunk)
def write(chunk):
if checksum:
checksum.update(chunk)
f.write(chunk)
request = Request(url)
# request headers
if headers:
for key, value in headers.items():
request.add_header(key, value)
try:
response = read_request(request,
offset=size,
console=console,
progress_func=progress,
write_func=write)
except InvalidOffsetException:
response = None
if checksum:
if response:
origin_checksum = response.headers.get(checksum_header)
else:
# check whether file is already complete
meta = get_url_meta(url, checksum_header)
origin_checksum = meta.get('checksum')
if origin_checksum is None:
raise MissingChecksumHeader
if checksum.hexdigest() != origin_checksum:
raise InvalidChecksumException
if console:
console.write("checksum/sha256 OK\n")
return path

View File

@ -1,26 +1,20 @@
+# encoding: utf8
from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
from ..attrs import LANG
-from . import language_data
+from .language_data import *

class Spanish(Language):
    lang = 'es'

    class Defaults(Language.Defaults):
-       tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'es'
-       prefixes = tuple(language_data.TOKENIZER_PREFIXES)
-       suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
-       infixes = tuple(language_data.TOKENIZER_INFIXES)
-       tag_map = dict(language_data.TAG_MAP)
-       stop_words = set(language_data.STOP_WORDS)
+       tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+       stop_words = STOP_WORDS

View File

@ -1,356 +1,19 @@
# encoding: utf8
from __future__ import unicode_literals
+from .. import language_data as base
+from ..language_data import update_exc, strings_to_exc
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
+STOP_WORDS = set(STOP_WORDS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
+__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
import re
STOP_WORDS = set()
TOKENIZER_PREFIXES = map(re.escape, r'''
,
"
(
[
{
*
<
>
$
£
'
``
`
#
US$
C$
A$
a-
....
...
»
_
§
'''.strip().split('\n'))
TOKENIZER_SUFFIXES = r'''
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
«
_
''
's
'S
s
S
°
\.\.
\.\.\.
\.\.\.\.
(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
\-\-
´
(?<=[0-9])km²
(?<=[0-9])m²
(?<=[0-9])cm²
(?<=[0-9])mm²
(?<=[0-9])km³
(?<=[0-9])m³
(?<=[0-9])cm³
(?<=[0-9])mm³
(?<=[0-9])ha
(?<=[0-9])km
(?<=[0-9])m
(?<=[0-9])cm
(?<=[0-9])mm
(?<=[0-9])µm
(?<=[0-9])nm
(?<=[0-9])yd
(?<=[0-9])in
(?<=[0-9])ft
(?<=[0-9])kg
(?<=[0-9])g
(?<=[0-9])mg
(?<=[0-9])µg
(?<=[0-9])t
(?<=[0-9])lb
(?<=[0-9])oz
(?<=[0-9])m/s
(?<=[0-9])km/h
(?<=[0-9])mph
(?<=[0-9])°C
(?<=[0-9])°K
(?<=[0-9])°F
(?<=[0-9])hPa
(?<=[0-9])Pa
(?<=[0-9])mbar
(?<=[0-9])mb
(?<=[0-9])T
(?<=[0-9])G
(?<=[0-9])M
(?<=[0-9])K
(?<=[0-9])kb
'''.strip().split('\n')
TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) '''
r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
TOKENIZER_EXCEPTIONS = {
"vs.": [{"F": "vs."}],
"''": [{"F": "''"}],
"": [{"F": "", "L": "--", "pos": "$,"}],
"a.m.": [{"F": "a.m."}],
"p.m.": [{"F": "p.m."}],
"1a.m.": [{"F": "1"}, {"F": "a.m."}],
"2a.m.": [{"F": "2"}, {"F": "a.m."}],
"3a.m.": [{"F": "3"}, {"F": "a.m."}],
"4a.m.": [{"F": "4"}, {"F": "a.m."}],
"5a.m.": [{"F": "5"}, {"F": "a.m."}],
"6a.m.": [{"F": "6"}, {"F": "a.m."}],
"7a.m.": [{"F": "7"}, {"F": "a.m."}],
"8a.m.": [{"F": "8"}, {"F": "a.m."}],
"9a.m.": [{"F": "9"}, {"F": "a.m."}],
"10a.m.": [{"F": "10"}, {"F": "a.m."}],
"11a.m.": [{"F": "11"}, {"F": "a.m."}],
"12a.m.": [{"F": "12"}, {"F": "a.m."}],
"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],
"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],
"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],
"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],
"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],
"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],
"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],
"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],
"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],
"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],
"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],
"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],
"p.m.": [{"F": "p.m."}],
"1p.m.": [{"F": "1"}, {"F": "p.m."}],
"2p.m.": [{"F": "2"}, {"F": "p.m."}],
"3p.m.": [{"F": "3"}, {"F": "p.m."}],
"4p.m.": [{"F": "4"}, {"F": "p.m."}],
"5p.m.": [{"F": "5"}, {"F": "p.m."}],
"6p.m.": [{"F": "6"}, {"F": "p.m."}],
"7p.m.": [{"F": "7"}, {"F": "p.m."}],
"8p.m.": [{"F": "8"}, {"F": "p.m."}],
"9p.m.": [{"F": "9"}, {"F": "p.m."}],
"10p.m.": [{"F": "10"}, {"F": "p.m."}],
"11p.m.": [{"F": "11"}, {"F": "p.m."}],
"12p.m.": [{"F": "12"}, {"F": "p.m."}],
"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],
"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],
"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],
"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],
"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],
"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],
"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],
"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],
"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],
"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],
"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],
"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],
"Ala.": [{"F": "Ala."}],
"Ariz.": [{"F": "Ariz."}],
"Ark.": [{"F": "Ark."}],
"Calif.": [{"F": "Calif."}],
"Colo.": [{"F": "Colo."}],
"Conn.": [{"F": "Conn."}],
"Del.": [{"F": "Del."}],
"D.C.": [{"F": "D.C."}],
"Fla.": [{"F": "Fla."}],
"Ga.": [{"F": "Ga."}],
"Ill.": [{"F": "Ill."}],
"Ind.": [{"F": "Ind."}],
"Kans.": [{"F": "Kans."}],
"Kan.": [{"F": "Kan."}],
"Ky.": [{"F": "Ky."}],
"La.": [{"F": "La."}],
"Md.": [{"F": "Md."}],
"Mass.": [{"F": "Mass."}],
"Mich.": [{"F": "Mich."}],
"Minn.": [{"F": "Minn."}],
"Miss.": [{"F": "Miss."}],
"Mo.": [{"F": "Mo."}],
"Mont.": [{"F": "Mont."}],
"Nebr.": [{"F": "Nebr."}],
"Neb.": [{"F": "Neb."}],
"Nev.": [{"F": "Nev."}],
"N.H.": [{"F": "N.H."}],
"N.J.": [{"F": "N.J."}],
"N.M.": [{"F": "N.M."}],
"N.Y.": [{"F": "N.Y."}],
"N.C.": [{"F": "N.C."}],
"N.D.": [{"F": "N.D."}],
"Okla.": [{"F": "Okla."}],
"Ore.": [{"F": "Ore."}],
"Pa.": [{"F": "Pa."}],
"Tenn.": [{"F": "Tenn."}],
"Va.": [{"F": "Va."}],
"Wash.": [{"F": "Wash."}],
"Wis.": [{"F": "Wis."}],
":)": [{"F": ":)"}],
"<3": [{"F": "<3"}],
";)": [{"F": ";)"}],
"(:": [{"F": "(:"}],
":(": [{"F": ":("}],
"-_-": [{"F": "-_-"}],
"=)": [{"F": "=)"}],
":/": [{"F": ":/"}],
":>": [{"F": ":>"}],
";-)": [{"F": ";-)"}],
":Y": [{"F": ":Y"}],
":P": [{"F": ":P"}],
":-P": [{"F": ":-P"}],
":3": [{"F": ":3"}],
"=3": [{"F": "=3"}],
"xD": [{"F": "xD"}],
"^_^": [{"F": "^_^"}],
"=]": [{"F": "=]"}],
"=D": [{"F": "=D"}],
"<333": [{"F": "<333"}],
":))": [{"F": ":))"}],
":0": [{"F": ":0"}],
"-__-": [{"F": "-__-"}],
"xDD": [{"F": "xDD"}],
"o_o": [{"F": "o_o"}],
"o_O": [{"F": "o_O"}],
"V_V": [{"F": "V_V"}],
"=[[": [{"F": "=[["}],
"<33": [{"F": "<33"}],
";p": [{"F": ";p"}],
";D": [{"F": ";D"}],
";-p": [{"F": ";-p"}],
";(": [{"F": ";("}],
":p": [{"F": ":p"}],
":]": [{"F": ":]"}],
":O": [{"F": ":O"}],
":-/": [{"F": ":-/"}],
":-)": [{"F": ":-)"}],
":(((": [{"F": ":((("}],
":((": [{"F": ":(("}],
":')": [{"F": ":')"}],
"(^_^)": [{"F": "(^_^)"}],
"(=": [{"F": "(="}],
"o.O": [{"F": "o.O"}],
"\")": [{"F": "\")"}],
"a.": [{"F": "a."}],
"b.": [{"F": "b."}],
"c.": [{"F": "c."}],
"d.": [{"F": "d."}],
"e.": [{"F": "e."}],
"f.": [{"F": "f."}],
"g.": [{"F": "g."}],
"h.": [{"F": "h."}],
"i.": [{"F": "i."}],
"j.": [{"F": "j."}],
"k.": [{"F": "k."}],
"l.": [{"F": "l."}],
"m.": [{"F": "m."}],
"n.": [{"F": "n."}],
"o.": [{"F": "o."}],
"p.": [{"F": "p."}],
"q.": [{"F": "q."}],
"r.": [{"F": "r."}],
"s.": [{"F": "s."}],
"t.": [{"F": "t."}],
"u.": [{"F": "u."}],
"v.": [{"F": "v."}],
"w.": [{"F": "w."}],
"x.": [{"F": "x."}],
"y.": [{"F": "y."}],
"z.": [{"F": "z."}],
}
TAG_MAP = {
"$(": {"pos": "PUNCT", "PunctType": "Brck"},
"$,": {"pos": "PUNCT", "PunctType": "Comm"},
"$.": {"pos": "PUNCT", "PunctType": "Peri"},
"ADJA": {"pos": "ADJ"},
"ADJD": {"pos": "ADJ", "Variant": "Short"},
"ADV": {"pos": "ADV"},
"APPO": {"pos": "ADP", "AdpType": "Post"},
"APPR": {"pos": "ADP", "AdpType": "Prep"},
"APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"},
"APZR": {"pos": "ADP", "AdpType": "Circ"},
"ART": {"pos": "DET", "PronType": "Art"},
"CARD": {"pos": "NUM", "NumType": "Card"},
"FM": {"pos": "X", "Foreign": "Yes"},
"ITJ": {"pos": "INTJ"},
"KOKOM": {"pos": "CONJ", "ConjType": "Comp"},
"KON": {"pos": "CONJ"},
"KOUI": {"pos": "SCONJ"},
"KOUS": {"pos": "SCONJ"},
"NE": {"pos": "PROPN"},
"NNE": {"pos": "PROPN"},
"NN": {"pos": "NOUN"},
"PAV": {"pos": "ADV", "PronType": "Dem"},
"PROAV": {"pos": "ADV", "PronType": "Dem"},
"PDAT": {"pos": "DET", "PronType": "Dem"},
"PDS": {"pos": "PRON", "PronType": "Dem"},
"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"},
"PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"},
"PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"},
"PPER": {"pos": "PRON", "PronType": "Prs"},
"PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"},
"PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"},
"PRELAT": {"pos": "DET", "PronType": "Rel"},
"PRELS": {"pos": "PRON", "PronType": "Rel"},
"PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"},
"PTKA": {"pos": "PART"},
"PTKANT": {"pos": "PART", "PartType": "Res"},
"PTKNEG": {"pos": "PART", "Negative": "Neg"},
"PTKVZ": {"pos": "PART", "PartType": "Vbp"},
"PTKZU": {"pos": "PART", "PartType": "Inf"},
"PWAT": {"pos": "DET", "PronType": "Int"},
"PWAV": {"pos": "ADV", "PronType": "Int"},
"PWS": {"pos": "PRON", "PronType": "Int"},
"TRUNC": {"pos": "X", "Hyph": "Yes"},
"VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"},
"VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"},
"VAINF": {"pos": "AUX", "VerbForm": "Inf"},
"VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"},
"VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"},
"VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"},
"VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"},
"VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"},
"VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"},
"VVINF": {"pos": "VERB", "VerbForm": "Inf"},
"VVIZU": {"pos": "VERB", "VerbForm": "Inf"},
"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},
"XY": {"pos": "X"},
"SP": {"pos": "SPACE"}
}
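
The new es/language_data.py at the top of this hunk builds its exception table in three steps: copy the hand-written TOKENIZER_EXCEPTIONS, fold in the ORTH_ONLY strings, then fold in the shared base.EMOTICONS. The helpers come from spacy/language_data; the re-implementation below is only a plausible reading of what they do, written for illustration (the sample ORTH_ONLY and EMOTICONS values are made up):

def strings_to_exc(strings):
    # Map each literal string (an emoticon, an "orth only" abbreviation)
    # to a one-token exception that keeps the string intact.
    return {s: [{"ORTH": s}] for s in strings}

def update_exc(base_exceptions, additions):
    # Merge a second exception table into the first, in place;
    # later additions win on key collisions.
    base_exceptions.update(additions)

EMOTICONS = [":)", ":(", "<3"]          # made-up sample values
ORTH_ONLY = ["a.m.", "p.m.", "EE.UU."]  # made-up sample values

TOKENIZER_EXCEPTIONS = {
    "del": [{"ORTH": "de"}, {"ORTH": "el"}],
}
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))

print(len(TOKENIZER_EXCEPTIONS))  # 7: 1 contraction + 3 orth-only + 3 emoticons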

spacy/es/stop_words.py Normal file
View File

@@ -0,0 +1,84 @@
# encoding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí
al algo alguna algunas alguno algunos algún alli allí alrededor ambos ampleamos
antano antaño ante anterior antes apenas aproximadamente aquel aquella aquellas
aquello aquellos aqui aquél aquélla aquéllas aquéllos aquí arriba arribaabajo
aseguró asi así atras aun aunque ayer añadió aún
bajo bastante bien breve buen buena buenas bueno buenos
cada casi cerca cierta ciertas cierto ciertos cinco claro comentó como con
conmigo conocer conseguimos conseguir considera consideró consigo consigue
consiguen consigues contigo contra cosas creo cual cuales cualquier cuando
cuanta cuantas cuanto cuantos cuatro cuenta cuál cuáles cuándo cuánta cuántas
cuánto cuántos cómo
da dado dan dar de debajo debe deben debido decir dejó del delante demasiado
demás dentro deprisa desde despacio despues después detras detrás dia dias dice
dicen dicho dieron diferente diferentes dijeron dijo dio donde dos durante día
días dónde
ejemplo el ella ellas ello ellos embargo empleais emplean emplear empleas
empleo en encima encuentra enfrente enseguida entonces entre era eramos eran
eras eres es esa esas ese eso esos esta estaba estaban estado estados estais
estamos estan estar estará estas este esto estos estoy estuvo está están ex
excepto existe existen explicó expresó él ésa ésas ése ésos ésta éstas éste
éstos
fin final fue fuera fueron fui fuimos
general gran grandes gueno
ha haber habia habla hablan habrá había habían hace haceis hacemos hacen hacer
hacerlo haces hacia haciendo hago han hasta hay haya he hecho hemos hicieron
hizo horas hoy hubo
igual incluso indicó informo informó intenta intentais intentamos intentan
intentar intentas intento ir
junto
la lado largo las le lejos les llegó lleva llevar lo los luego lugar
mal manera manifestó mas mayor me mediante medio mejor mencionó menos menudo mi
mia mias mientras mio mios mis misma mismas mismo mismos modo momento mucha
muchas mucho muchos muy más mía mías mío míos
nada nadie ni ninguna ningunas ninguno ningunos ningún no nos nosotras nosotros
nuestra nuestras nuestro nuestros nueva nuevas nuevo nuevos nunca
ocho os otra otras otro otros
pais para parece parte partir pasada pasado paìs peor pero pesar poca pocas
poco pocos podeis podemos poder podria podriais podriamos podrian podrias podrá
podrán podría podrían poner por porque posible primer primera primero primeros
principalmente pronto propia propias propio propios proximo próximo próximos
pudo pueda puede pueden puedo pues
qeu que quedó queremos quien quienes quiere quiza quizas quizá quizás quién quiénes qué
raras realizado realizar realizó repente respecto
sabe sabeis sabemos saben saber sabes salvo se sea sean segun segunda segundo
según seis ser sera será serán sería señaló si sido siempre siendo siete sigue
siguiente sin sino sobre sois sola solamente solas solo solos somos son soy
soyos su supuesto sus suya suyas suyo sólo
tal tambien también tampoco tan tanto tarde te temprano tendrá tendrán teneis
tenemos tener tenga tengo tenido tenía tercera ti tiempo tiene tienen toda
todas todavia todavía todo todos total trabaja trabajais trabajamos trabajan
trabajar trabajas trabajo tras trata través tres tu tus tuvo tuya tuyas tuyo
tuyos
ultimo un una unas uno unos usa usais usamos usan usar usas uso usted ustedes
última últimas último últimos
va vais valor vamos van varias varios vaya veces ver verdad verdadera verdadero
vez vosotras vosotros voy vuestra vuestras vuestro vuestros
ya yo
""".split())

View File

@@ -0,0 +1,318 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA
TOKENIZER_EXCEPTIONS = {
"accidentarse": [
{ORTH: "accidentar", LEMMA: "accidentar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"aceptarlo": [
{ORTH: "aceptar", LEMMA: "aceptar", POS: AUX},
{ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON}
],
"acompañarla": [
{ORTH: "acompañar", LEMMA: "acompañar", POS: AUX},
{ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON}
],
"advertirle": [
{ORTH: "advertir", LEMMA: "advertir", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"al": [
{ORTH: "a", LEMMA: "a", POS: ADP},
{ORTH: "el", LEMMA: "el", POS: DET}
],
"anunciarnos": [
{ORTH: "anunciar", LEMMA: "anunciar", POS: AUX},
{ORTH: "nos", LEMMA: PRON_LEMMA, POS: PRON}
],
"asegurándole": [
{ORTH: "asegurando", LEMMA: "asegurar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"considerarle": [
{ORTH: "considerar", LEMMA: "considerar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"decirle": [
{ORTH: "decir", LEMMA: "decir", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"decirles": [
{ORTH: "decir", LEMMA: "decir", POS: AUX},
{ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON}
],
"decirte": [
{ORTH: "Decir", LEMMA: "decir", POS: AUX},
{ORTH: "te", LEMMA: PRON_LEMMA, POS: PRON}
],
"dejarla": [
{ORTH: "dejar", LEMMA: "dejar", POS: AUX},
{ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON}
],
"dejarnos": [
{ORTH: "dejar", LEMMA: "dejar", POS: AUX},
{ORTH: "nos", LEMMA: PRON_LEMMA, POS: PRON}
],
"dejándole": [
{ORTH: "dejando", LEMMA: "dejar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"del": [
{ORTH: "de", LEMMA: "de", POS: ADP},
{ORTH: "el", LEMMA: "el", POS: DET}
],
"demostrarles": [
{ORTH: "demostrar", LEMMA: "demostrar", POS: AUX},
{ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON}
],
"diciéndole": [
{ORTH: "diciendo", LEMMA: "decir", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"diciéndoles": [
{ORTH: "diciendo", LEMMA: "decir", POS: AUX},
{ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON}
],
"diferenciarse": [
{ORTH: "diferenciar", LEMMA: "diferenciar", POS: AUX},
{ORTH: "se", LEMMA: "él", POS: PRON}
],
"divirtiéndome": [
{ORTH: "divirtiendo", LEMMA: "divertir", POS: AUX},
{ORTH: "me", LEMMA: PRON_LEMMA, POS: PRON}
],
"ensanchándose": [
{ORTH: "ensanchando", LEMMA: "ensanchar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"explicarles": [
{ORTH: "explicar", LEMMA: "explicar", POS: AUX},
{ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON}
],
"haberla": [
{ORTH: "haber", LEMMA: "haber", POS: AUX},
{ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON}
],
"haberlas": [
{ORTH: "haber", LEMMA: "haber", POS: AUX},
{ORTH: "las", LEMMA: PRON_LEMMA, POS: PRON}
],
"haberlo": [
{ORTH: "haber", LEMMA: "haber", POS: AUX},
{ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON}
],
"haberlos": [
{ORTH: "haber", LEMMA: "haber", POS: AUX},
{ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON}
],
"haberme": [
{ORTH: "haber", LEMMA: "haber", POS: AUX},
{ORTH: "me", LEMMA: PRON_LEMMA, POS: PRON}
],
"haberse": [
{ORTH: "haber", LEMMA: "haber", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"hacerle": [
{ORTH: "hacer", LEMMA: "hacer", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"hacerles": [
{ORTH: "hacer", LEMMA: "hacer", POS: AUX},
{ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON}
],
"hallarse": [
{ORTH: "hallar", LEMMA: "hallar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"imaginaros": [
{ORTH: "imaginar", LEMMA: "imaginar", POS: AUX},
{ORTH: "os", LEMMA: PRON_LEMMA, POS: PRON}
],
"insinuarle": [
{ORTH: "insinuar", LEMMA: "insinuar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"justificarla": [
{ORTH: "justificar", LEMMA: "justificar", POS: AUX},
{ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON}
],
"mantenerlas": [
{ORTH: "mantener", LEMMA: "mantener", POS: AUX},
{ORTH: "las", LEMMA: PRON_LEMMA, POS: PRON}
],
"mantenerlos": [
{ORTH: "mantener", LEMMA: "mantener", POS: AUX},
{ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON}
],
"mantenerme": [
{ORTH: "mantener", LEMMA: "mantener", POS: AUX},
{ORTH: "me", LEMMA: PRON_LEMMA, POS: PRON}
],
"pasarte": [
{ORTH: "pasar", LEMMA: "pasar", POS: AUX},
{ORTH: "te", LEMMA: PRON_LEMMA, POS: PRON}
],
"pedirle": [
{ORTH: "pedir", LEMMA: "pedir", POS: AUX},
{ORTH: "le", LEMMA: "él", POS: PRON}
],
"pel": [
{ORTH: "per", LEMMA: "per", POS: ADP},
{ORTH: "el", LEMMA: "el", POS: DET}
],
"pidiéndonos": [
{ORTH: "pidiendo", LEMMA: "pedir", POS: AUX},
{ORTH: "nos", LEMMA: PRON_LEMMA, POS: PRON}
],
"poderle": [
{ORTH: "poder", LEMMA: "poder", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"preguntarse": [
{ORTH: "preguntar", LEMMA: "preguntar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"preguntándose": [
{ORTH: "preguntando", LEMMA: "preguntar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"presentarla": [
{ORTH: "presentar", LEMMA: "presentar", POS: AUX},
{ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON}
],
"pudiéndolo": [
{ORTH: "pudiendo", LEMMA: "poder", POS: AUX},
{ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON}
],
"pudiéndose": [
{ORTH: "pudiendo", LEMMA: "poder", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"quererle": [
{ORTH: "querer", LEMMA: "querer", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"rasgarse": [
{ORTH: "Rasgar", LEMMA: "rasgar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"repetirlo": [
{ORTH: "repetir", LEMMA: "repetir", POS: AUX},
{ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON}
],
"robarle": [
{ORTH: "robar", LEMMA: "robar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"seguirlos": [
{ORTH: "seguir", LEMMA: "seguir", POS: AUX},
{ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON}
],
"serle": [
{ORTH: "ser", LEMMA: "ser", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"serlo": [
{ORTH: "ser", LEMMA: "ser", POS: AUX},
{ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON}
],
"señalándole": [
{ORTH: "señalando", LEMMA: "señalar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"suplicarle": [
{ORTH: "suplicar", LEMMA: "suplicar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"tenerlos": [
{ORTH: "tener", LEMMA: "tener", POS: AUX},
{ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON}
],
"vengarse": [
{ORTH: "vengar", LEMMA: "vengar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"verla": [
{ORTH: "ver", LEMMA: "ver", POS: AUX},
{ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON}
],
"verle": [
{ORTH: "ver", LEMMA: "ver", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"volverlo": [
{ORTH: "volver", LEMMA: "volver", POS: AUX},
{ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON}
]
}
ORTH_ONLY = [
]
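
Each key above is a surface form the tokenizer must not split by its usual rules; the value spells out the sub-tokens with their ORTH, LEMMA and POS (PRON_LEMMA is the shared "-PRON-" placeholder). The sketch below shows the shape of a lookup-before-split step over such a table; it is a toy whitespace tokenizer for illustration, not spaCy's, and plain string keys stand in for the ORTH/LEMMA/POS symbol constants:

PRON_LEMMA = "-PRON-"

TOKENIZER_EXCEPTIONS = {
    "del": [
        {"ORTH": "de", "LEMMA": "de", "POS": "ADP"},
        {"ORTH": "el", "LEMMA": "el", "POS": "DET"},
    ],
    "decirle": [
        {"ORTH": "decir", "LEMMA": "decir", "POS": "AUX"},
        {"ORTH": "le", "LEMMA": PRON_LEMMA, "POS": "PRON"},
    ],
}

def tokenize(text):
    # Consult the exception table before falling back to plain words.
    tokens = []
    for word in text.split():
        special = TOKENIZER_EXCEPTIONS.get(word)
        if special is not None:
            tokens.extend(t["ORTH"] for t in special)
        else:
            tokens.append(word)
    return tokens

print(tokenize("acaba de decirle algo"))
# ['acaba', 'de', 'decir', 'le', 'algo']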

View File

@@ -1,9 +0,0 @@
from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
class Finnish(Language):
pass

View File

@@ -1,27 +1,20 @@
+# encoding: utf8
from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
from ..attrs import LANG
-from . import language_data
+from .language_data import *
class French(Language):
    lang = 'fr'
    class Defaults(Language.Defaults):
-        tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'fr'
-        prefixes = tuple(language_data.TOKENIZER_PREFIXES)
-        suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
-        infixes = tuple(language_data.TOKENIZER_INFIXES)
-        tag_map = dict(language_data.TAG_MAP)
-        stop_words = set(language_data.STOP_WORDS)
+        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        stop_words = STOP_WORDS

View File

@@ -1,356 +1,14 @@
# encoding: utf8
from __future__ import unicode_literals
+from .. import language_data as base
+from ..language_data import strings_to_exc
+from .stop_words import STOP_WORDS
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+STOP_WORDS = set(STOP_WORDS)
+__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
import re
STOP_WORDS = set()
TOKENIZER_PREFIXES = map(re.escape, r'''
,
"
(
[
{
*
<
>
$
£
'
``
`
#
US$
C$
A$
a-
....
...
»
_
§
'''.strip().split('\n'))
TOKENIZER_SUFFIXES = r'''
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
«
_
''
's
'S
s
S
°
\.\.
\.\.\.
\.\.\.\.
(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
\-\-
´
(?<=[0-9])km²
(?<=[0-9])m²
(?<=[0-9])cm²
(?<=[0-9])mm²
(?<=[0-9])km³
(?<=[0-9])m³
(?<=[0-9])cm³
(?<=[0-9])mm³
(?<=[0-9])ha
(?<=[0-9])km
(?<=[0-9])m
(?<=[0-9])cm
(?<=[0-9])mm
(?<=[0-9])µm
(?<=[0-9])nm
(?<=[0-9])yd
(?<=[0-9])in
(?<=[0-9])ft
(?<=[0-9])kg
(?<=[0-9])g
(?<=[0-9])mg
(?<=[0-9])µg
(?<=[0-9])t
(?<=[0-9])lb
(?<=[0-9])oz
(?<=[0-9])m/s
(?<=[0-9])km/h
(?<=[0-9])mph
(?<=[0-9])°C
(?<=[0-9])°K
(?<=[0-9])°F
(?<=[0-9])hPa
(?<=[0-9])Pa
(?<=[0-9])mbar
(?<=[0-9])mb
(?<=[0-9])T
(?<=[0-9])G
(?<=[0-9])M
(?<=[0-9])K
(?<=[0-9])kb
'''.strip().split('\n')
TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) '''
r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
TOKENIZER_EXCEPTIONS = {
"vs.": [{"F": "vs."}],
"''": [{"F": "''"}],
"": [{"F": "", "L": "--", "pos": "$,"}],
"a.m.": [{"F": "a.m."}],
"p.m.": [{"F": "p.m."}],
"1a.m.": [{"F": "1"}, {"F": "a.m."}],
"2a.m.": [{"F": "2"}, {"F": "a.m."}],
"3a.m.": [{"F": "3"}, {"F": "a.m."}],
"4a.m.": [{"F": "4"}, {"F": "a.m."}],
"5a.m.": [{"F": "5"}, {"F": "a.m."}],
"6a.m.": [{"F": "6"}, {"F": "a.m."}],
"7a.m.": [{"F": "7"}, {"F": "a.m."}],
"8a.m.": [{"F": "8"}, {"F": "a.m."}],
"9a.m.": [{"F": "9"}, {"F": "a.m."}],
"10a.m.": [{"F": "10"}, {"F": "a.m."}],
"11a.m.": [{"F": "11"}, {"F": "a.m."}],
"12a.m.": [{"F": "12"}, {"F": "a.m."}],
"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],
"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],
"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],
"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],
"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],
"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],
"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],
"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],
"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],
"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],
"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],
"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],
"p.m.": [{"F": "p.m."}],
"1p.m.": [{"F": "1"}, {"F": "p.m."}],
"2p.m.": [{"F": "2"}, {"F": "p.m."}],
"3p.m.": [{"F": "3"}, {"F": "p.m."}],
"4p.m.": [{"F": "4"}, {"F": "p.m."}],
"5p.m.": [{"F": "5"}, {"F": "p.m."}],
"6p.m.": [{"F": "6"}, {"F": "p.m."}],
"7p.m.": [{"F": "7"}, {"F": "p.m."}],
"8p.m.": [{"F": "8"}, {"F": "p.m."}],
"9p.m.": [{"F": "9"}, {"F": "p.m."}],
"10p.m.": [{"F": "10"}, {"F": "p.m."}],
"11p.m.": [{"F": "11"}, {"F": "p.m."}],
"12p.m.": [{"F": "12"}, {"F": "p.m."}],
"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],
"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],
"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],
"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],
"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],
"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],
"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],
"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],
"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],
"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],
"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],
"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],
"Ala.": [{"F": "Ala."}],
"Ariz.": [{"F": "Ariz."}],
"Ark.": [{"F": "Ark."}],
"Calif.": [{"F": "Calif."}],
"Colo.": [{"F": "Colo."}],
"Conn.": [{"F": "Conn."}],
"Del.": [{"F": "Del."}],
"D.C.": [{"F": "D.C."}],
"Fla.": [{"F": "Fla."}],
"Ga.": [{"F": "Ga."}],
"Ill.": [{"F": "Ill."}],
"Ind.": [{"F": "Ind."}],
"Kans.": [{"F": "Kans."}],
"Kan.": [{"F": "Kan."}],
"Ky.": [{"F": "Ky."}],
"La.": [{"F": "La."}],
"Md.": [{"F": "Md."}],
"Mass.": [{"F": "Mass."}],
"Mich.": [{"F": "Mich."}],
"Minn.": [{"F": "Minn."}],
"Miss.": [{"F": "Miss."}],
"Mo.": [{"F": "Mo."}],
"Mont.": [{"F": "Mont."}],
"Nebr.": [{"F": "Nebr."}],
"Neb.": [{"F": "Neb."}],
"Nev.": [{"F": "Nev."}],
"N.H.": [{"F": "N.H."}],
"N.J.": [{"F": "N.J."}],
"N.M.": [{"F": "N.M."}],
"N.Y.": [{"F": "N.Y."}],
"N.C.": [{"F": "N.C."}],
"N.D.": [{"F": "N.D."}],
"Okla.": [{"F": "Okla."}],
"Ore.": [{"F": "Ore."}],
"Pa.": [{"F": "Pa."}],
"Tenn.": [{"F": "Tenn."}],
"Va.": [{"F": "Va."}],
"Wash.": [{"F": "Wash."}],
"Wis.": [{"F": "Wis."}],
":)": [{"F": ":)"}],
"<3": [{"F": "<3"}],
";)": [{"F": ";)"}],
"(:": [{"F": "(:"}],
":(": [{"F": ":("}],
"-_-": [{"F": "-_-"}],
"=)": [{"F": "=)"}],
":/": [{"F": ":/"}],
":>": [{"F": ":>"}],
";-)": [{"F": ";-)"}],
":Y": [{"F": ":Y"}],
":P": [{"F": ":P"}],
":-P": [{"F": ":-P"}],
":3": [{"F": ":3"}],
"=3": [{"F": "=3"}],
"xD": [{"F": "xD"}],
"^_^": [{"F": "^_^"}],
"=]": [{"F": "=]"}],
"=D": [{"F": "=D"}],
"<333": [{"F": "<333"}],
":))": [{"F": ":))"}],
":0": [{"F": ":0"}],
"-__-": [{"F": "-__-"}],
"xDD": [{"F": "xDD"}],
"o_o": [{"F": "o_o"}],
"o_O": [{"F": "o_O"}],
"V_V": [{"F": "V_V"}],
"=[[": [{"F": "=[["}],
"<33": [{"F": "<33"}],
";p": [{"F": ";p"}],
";D": [{"F": ";D"}],
";-p": [{"F": ";-p"}],
";(": [{"F": ";("}],
":p": [{"F": ":p"}],
":]": [{"F": ":]"}],
":O": [{"F": ":O"}],
":-/": [{"F": ":-/"}],
":-)": [{"F": ":-)"}],
":(((": [{"F": ":((("}],
":((": [{"F": ":(("}],
":')": [{"F": ":')"}],
"(^_^)": [{"F": "(^_^)"}],
"(=": [{"F": "(="}],
"o.O": [{"F": "o.O"}],
"\")": [{"F": "\")"}],
"a.": [{"F": "a."}],
"b.": [{"F": "b."}],
"c.": [{"F": "c."}],
"d.": [{"F": "d."}],
"e.": [{"F": "e."}],
"f.": [{"F": "f."}],
"g.": [{"F": "g."}],
"h.": [{"F": "h."}],
"i.": [{"F": "i."}],
"j.": [{"F": "j."}],
"k.": [{"F": "k."}],
"l.": [{"F": "l."}],
"m.": [{"F": "m."}],
"n.": [{"F": "n."}],
"o.": [{"F": "o."}],
"p.": [{"F": "p."}],
"q.": [{"F": "q."}],
"r.": [{"F": "r."}],
"s.": [{"F": "s."}],
"t.": [{"F": "t."}],
"u.": [{"F": "u."}],
"v.": [{"F": "v."}],
"w.": [{"F": "w."}],
"x.": [{"F": "x."}],
"y.": [{"F": "y."}],
"z.": [{"F": "z."}],
}
TAG_MAP = {
"$(": {"pos": "PUNCT", "PunctType": "Brck"},
"$,": {"pos": "PUNCT", "PunctType": "Comm"},
"$.": {"pos": "PUNCT", "PunctType": "Peri"},
"ADJA": {"pos": "ADJ"},
"ADJD": {"pos": "ADJ", "Variant": "Short"},
"ADV": {"pos": "ADV"},
"APPO": {"pos": "ADP", "AdpType": "Post"},
"APPR": {"pos": "ADP", "AdpType": "Prep"},
"APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"},
"APZR": {"pos": "ADP", "AdpType": "Circ"},
"ART": {"pos": "DET", "PronType": "Art"},
"CARD": {"pos": "NUM", "NumType": "Card"},
"FM": {"pos": "X", "Foreign": "Yes"},
"ITJ": {"pos": "INTJ"},
"KOKOM": {"pos": "CONJ", "ConjType": "Comp"},
"KON": {"pos": "CONJ"},
"KOUI": {"pos": "SCONJ"},
"KOUS": {"pos": "SCONJ"},
"NE": {"pos": "PROPN"},
"NNE": {"pos": "PROPN"},
"NN": {"pos": "NOUN"},
"PAV": {"pos": "ADV", "PronType": "Dem"},
"PROAV": {"pos": "ADV", "PronType": "Dem"},
"PDAT": {"pos": "DET", "PronType": "Dem"},
"PDS": {"pos": "PRON", "PronType": "Dem"},
"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"},
"PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"},
"PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"},
"PPER": {"pos": "PRON", "PronType": "Prs"},
"PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"},
"PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"},
"PRELAT": {"pos": "DET", "PronType": "Rel"},
"PRELS": {"pos": "PRON", "PronType": "Rel"},
"PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"},
"PTKA": {"pos": "PART"},
"PTKANT": {"pos": "PART", "PartType": "Res"},
"PTKNEG": {"pos": "PART", "Negative": "Neg"},
"PTKVZ": {"pos": "PART", "PartType": "Vbp"},
"PTKZU": {"pos": "PART", "PartType": "Inf"},
"PWAT": {"pos": "DET", "PronType": "Int"},
"PWAV": {"pos": "ADV", "PronType": "Int"},
"PWS": {"pos": "PRON", "PronType": "Int"},
"TRUNC": {"pos": "X", "Hyph": "Yes"},
"VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"},
"VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"},
"VAINF": {"pos": "AUX", "VerbForm": "Inf"},
"VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"},
"VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"},
"VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"},
"VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"},
"VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"},
"VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"},
"VVINF": {"pos": "VERB", "VerbForm": "Inf"},
"VVIZU": {"pos": "VERB", "VerbForm": "Inf"},
"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},
"XY": {"pos": "X"},
"SP": {"pos": "SPACE"}
}

spacy/fr/stop_words.py Normal file
View File

@@ -0,0 +1,88 @@
# encoding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
a à â abord absolument afin ah ai aie ailleurs ainsi ait allaient allo allons
allô alors anterieur anterieure anterieures apres après as assez attendu au
aucun aucune aujourd aujourd'hui aupres auquel aura auraient aurait auront
aussi autre autrefois autrement autres autrui aux auxquelles auxquels avaient
avais avait avant avec avoir avons ayant
bah bas basee bat beau beaucoup bien bigre boum bravo brrr
ça car ce ceci cela celle celle-ci celle- celles celles-ci celles- celui
celui-ci celui- cent cependant certain certaine certaines certains certes ces
cet cette ceux ceux-ci ceux- chacun chacune chaque cher chers chez chiche
chut chère chères ci cinq cinquantaine cinquante cinquantième cinquième clac
clic combien comme comment comparable comparables compris concernant contre
couic crac
da dans de debout dedans dehors deja delà depuis dernier derniere derriere
derrière des desormais desquelles desquels dessous dessus deux deuxième
deuxièmement devant devers devra different differentes differents différent
différente différentes différents dire directe directement dit dite dits divers
diverse diverses dix dix-huit dix-neuf dix-sept dixième doit doivent donc dont
douze douzième dring du duquel durant dès désormais
effet egale egalement egales eh elle elle-même elles elles-mêmes en encore
enfin entre envers environ es ès est et etaient étaient etais étais etait était
etant étant etc été etre être eu euh eux eux-mêmes exactement excepté extenso
exterieur
fais faisaient faisant fait façon feront fi flac floc font
gens
ha hein hem hep hi ho holà hop hormis hors hou houp hue hui huit huitième hum
hurrah hélas i il ils importe
je jusqu jusque juste
la laisser laquelle las le lequel les lesquelles lesquels leur leurs longtemps
lors lorsque lui lui-meme lui-même lès
ma maint maintenant mais malgre malgré maximale me meme memes merci mes mien
mienne miennes miens mille mince minimale moi moi-meme moi-même moindres moins
mon moyennant multiple multiples même mêmes
na naturel naturelle naturelles ne neanmoins necessaire necessairement neuf
neuvième ni nombreuses nombreux non nos notamment notre nous nous-mêmes nouveau
nul néanmoins nôtre nôtres
o ô oh ohé ollé olé on ont onze onzième ore ou ouf ouias oust ouste outre
ouvert ouverte ouverts
paf pan par parce parfois parle parlent parler parmi parseme partant
particulier particulière particulièrement pas passé pendant pense permet
personne peu peut peuvent peux pff pfft pfut pif pire plein plouf plus
plusieurs plutôt possessif possessifs possible possibles pouah pour pourquoi
pourrais pourrait pouvait prealable precisement premier première premièrement
pres probable probante procedant proche près psitt pu puis puisque pur pure
qu quand quant quant-à-soi quanta quarante quatorze quatre quatre-vingt
quatrième quatrièmement que quel quelconque quelle quelles quelqu'un quelque
quelques quels qui quiconque quinze quoi quoique
rare rarement rares relative relativement remarquable rend rendre restant reste
restent restrictif retour revoici revoilà rien
sa sacrebleu sait sans sapristi sauf se sein seize selon semblable semblaient
semble semblent sent sept septième sera seraient serait seront ses seul seule
seulement si sien sienne siennes siens sinon six sixième soi soi-même soit
soixante son sont sous souvent specifique specifiques speculatif stop
strictement subtiles suffisant suffisante suffit suis suit suivant suivante
suivantes suivants suivre superpose sur surtout
ta tac tant tardive te tel telle tellement telles tels tenant tend tenir tente
tes tic tien tienne tiennes tiens toc toi toi-même ton touchant toujours tous
tout toute toutefois toutes treize trente tres trois troisième troisièmement
trop très tsoin tsouin tu
un une unes uniformement unique uniques uns
va vais vas vers via vif vifs vingt vivat vive vives vlan voici voilà vont vos
votre vous vous-mêmes vu vôtre vôtres
zut
""".split())

View File

@@ -1,27 +1,20 @@
+# encoding: utf8
from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
from ..attrs import LANG
-from . import language_data
+from .language_data import *
class Italian(Language):
    lang = 'it'
    class Defaults(Language.Defaults):
-        tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'it'
-        prefixes = tuple(language_data.TOKENIZER_PREFIXES)
-        suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
-        infixes = tuple(language_data.TOKENIZER_INFIXES)
-        tag_map = dict(language_data.TAG_MAP)
-        stop_words = set(language_data.STOP_WORDS)
+        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        stop_words = STOP_WORDS

View File

@@ -1,3 +0,0 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-z])

View File

@@ -1,55 +0,0 @@
{
"PRP": {
"I": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 1},
"me": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 3},
"mine": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 2},
"myself": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 4},
"you": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 0},
"yours": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 2},
"yourself": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 4},
"he": {"L": "-PRON-", "person": 3, "number": 1, "gender": 1, "case": 1},
"him": {"L": "-PRON-", "person": 3, "number": 1, "gender": 1, "case": 3},
"his": {"L": "-PRON-", "person": 3, "number": 1, "gender": 1, "case": 2},
"himself": {"L": "-PRON-", "person": 3, "number": 1, "gender": 1, "case": 4},
"she": {"L": "-PRON-", "person": 3, "number": 1, "gender": 2, "case": 1},
"her": {"L": "-PRON-", "person": 3, "number": 1, "gender": 2, "case": 3},
"hers": {"L": "-PRON-", "person": 3, "number": 1, "gender": 2, "case": 2},
"herself": {"L": "-PRON-", "person": 3, "number": 1, "gender": 2, "case": 4},
"it": {"L": "-PRON-", "person": 3, "number": 1, "gender": 3, "case": 0},
"its": {"L": "-PRON-", "person": 3, "number": 1, "gender": 3, "case": 2},
"itself": {"L": "-PRON-", "person": 3, "number": 1, "gender": 3, "case": 4},
"themself": {"L": "-PRON-", "person": 3, "number": 1, "gender": 0, "case": 4},
"we": {"L": "-PRON-", "person": 1, "number": 2, "gender": 0, "case": 1},
"us": {"L": "-PRON-", "person": 1, "number": 2, "gender": 0, "case": 3},
"ours": {"L": "-PRON-", "person": 1, "number": 2, "gender": 0, "case": 3},
"ourselves": {"L": "-PRON-", "person": 1, "number": 2, "gender": 0, "case": 4},
"yourselves": {"L": "-PRON-", "person": 2, "number": 2, "gender": 0, "case": 4},
"they": {"L": "-PRON-", "person": 3, "number": 2, "gender": 0, "case": 1},
"them": {"L": "-PRON-", "person": 3, "number": 2, "gender": 0, "case": 3},
"their": {"L": "-PRON-", "person": 3, "number": 2, "gender": 0, "case": 2},
"themselves": {"L": "-PRON-", "person": 3, "number": 2, "gender": 0, "case": 4}
},
"PRP$": {
"my": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 2},
"your": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 2},
"his": {"L": "-PRON-", "person": 3, "number": 1, "gender": 1, "case": 2},
"her": {"L": "-PRON-", "person": 3, "number": 1, "gender": 2, "case": 2},
"its": {"L": "-PRON-", "person": 3, "number": 1, "gender": 3, "case": 2},
"our": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 2},
"their": {"L": "-PRON-", "person": 3, "number": 2, "gender": 0, "case": 2}
},
"JJR": {
"better": {"L": "good", "misc": 1}
},
"JJS": {
"best": {"L": "good", "misc": 2}
},
"RBR": {
"better": {"L": "good", "misc": 1}
},
"RBS": {
"best": {"L": "good", "misc": 2}
}
}
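
The deleted morphs.json groups surface forms under their fine-grained tag and maps each to a lemma ("L") plus coded person/number/gender/case values. A lookup helper over a trimmed copy of the table, for illustration only; the integer codes are passed through untouched:

import json

MORPHS = json.loads("""
{
  "PRP": {
    "me":   {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 3},
    "they": {"L": "-PRON-", "person": 3, "number": 2, "gender": 0, "case": 1}
  },
  "JJR": {
    "better": {"L": "good", "misc": 1}
  }
}
""")

def analyse(word, tag):
    # Return (lemma, features) if the tag-specific table knows the word,
    # otherwise fall back to the surface form with no features.
    entry = MORPHS.get(tag, {}).get(word)
    if entry is None:
        return word, {}
    features = {k: v for k, v in entry.items() if k != "L"}
    return entry["L"], features

print(analyse("better", "JJR"))  # ('good', {'misc': 1})
print(analyse("me", "PRP"))      # ('-PRON-', {'person': 1, 'number': 1, 'gender': 0, 'case': 3})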

View File

@@ -1,21 +0,0 @@
,
"
(
[
{
*
<
$
£
'
``
`
#
US$
C$
A$
a-
....
...

View File

@@ -1,647 +0,0 @@
{
"'s": [{"F": "'s", "L": "'s"}],
"'S": [{"F": "'S", "L": "'s"}],
"ain't": [{"F": "ai", "L": "be", "pos": "VBP", "number": 2},
{"F": "n't", "L": "not", "pos": "RB"}],
"aint": [{"F": "ai", "L": "be", "pos": "VBP", "number": 2},
{"F": "nt", "L": "not", "pos": "RB"}],
"Ain't": [{"F": "Ai", "L": "be", "pos": "VBP", "number": 2},
{"F": "n't", "L": "not", "pos": "RB"}],
"aren't": [{"F": "are", "L": "be", "pos": "VBP", "number": 2},
{"F": "n't", "L": "not"}],
"arent": [{"F": "are", "L": "be", "pos": "VBP", "number": 2},
{"F": "nt", "L": "not"}],
"Aren't": [{"F": "Are", "L": "be", "pos": "VBP", "number": 2},
{"F": "n't", "L": "not"}],
"can't": [{"F": "ca", "L": "can", "pos": "MD"},
{"F": "n't", "L": "not", "pos": "RB"}],
"cant": [{"F": "ca", "L": "can", "pos": "MD"},
{"F": "nt", "L": "not", "pos": "RB"}],
"Can't": [{"F": "Ca", "L": "can", "pos": "MD"},
{"F": "n't", "L": "not", "pos": "RB"}],
"cannot": [{"F": "can", "pos": "MD"},
{"F": "not", "L": "not", "pos": "RB"}],
"Cannot": [{"F": "Can", "pos": "MD"},
{"F": "not", "L": "not", "pos": "RB"}],
"could've": [{"F": "could", "pos": "MD"},
{"F": "'ve", "L": "have", "pos": "VB"}],
"couldve": [{"F": "could", "pos": "MD"},
{"F": "ve", "L": "have", "pos": "VB"}],
"Could've": [{"F": "Could", "pos": "MD"},
{"F": "'ve", "L": "have", "pos": "VB"}],
"couldn't": [{"F": "could", "pos": "MD"},
{"F": "n't", "L": "not", "pos": "RB"}],
"couldnt": [{"F": "could", "pos": "MD"},
{"F": "nt", "L": "not", "pos": "RB"}],
"Couldn't": [{"F": "Could", "pos": "MD"},
{"F": "n't", "L": "not", "pos": "RB"}],
"couldn't've": [{"F": "could", "pos": "MD"},
{"F": "n't", "L": "not", "pos": "RB"},
{"F": "'ve", "pos": "VB"}],
"couldntve": [{"F": "could", "pos": "MD"},
{"F": "nt", "L": "not", "pos": "RB"},
{"F": "ve", "pos": "VB"}],
"Couldn't've": [{"F": "Could", "pos": "MD"},
{"F": "n't", "L": "not", "pos": "RB"},
{"F": "'ve", "pos": "VB"}],
"didn't": [{"F": "did", "pos": "VBD", "L": "do"},
{"F": "n't", "L": "not", "pos": "RB"}],
"didnt": [{"F": "did", "pos": "VBD", "L": "do"},
{"F": "nt", "L": "not", "pos": "RB"}],
"Didn't": [{"F": "Did", "pos": "VBD", "L": "do"},
{"F": "n't", "L": "not", "pos": "RB"}],
"doesn't": [{"F": "does", "L": "do", "pos": "VBZ"},
{"F": "n't", "L": "not", "pos": "RB"}],
"doesnt": [{"F": "does", "L": "do", "pos": "VBZ"},
{"F": "nt", "L": "not", "pos": "RB"}],
"Doesn't": [{"F": "Does", "L": "do", "pos": "VBZ"},
{"F": "n't", "L": "not", "pos": "RB"}],
"don't": [{"F": "do", "L": "do"},
{"F": "n't", "L": "not", "pos": "RB"}],
"dont": [{"F": "do", "L": "do"},
{"F": "nt", "L": "not", "pos": "RB"}],
"Don't": [{"F": "Do", "L": "do"},
{"F": "n't", "L": "not", "pos": "RB"}],
"hadn't": [{"F": "had", "L": "have", "pos": "VBD"},
{"F": "n't", "L": "not", "pos": "RB"}],
"hadnt": [{"F": "had", "L": "have", "pos": "VBD"},
{"F": "nt", "L": "not", "pos": "RB"}],
"Hadn't": [{"F": "Had", "L": "have", "pos": "VBD"},
{"F": "n't", "L": "not", "pos": "RB"}],
"hadn't've": [{"F": "had", "L": "have", "pos": "VBD"},
{"F": "n't", "L": "not", "pos": "RB"},
{"F": "'ve", "L": "have", "pos": "VB"}],
"hasn't": [{"F": "has"},
{"F": "n't", "L": "not", "pos": "RB"}],
"hasnt": [{"F": "has"},
{"F": "nt", "L": "not", "pos": "RB"}],
"haven't": [{"F": "have", "pos": "VB"},
{"F": "n't", "L": "not", "pos": "RB"}],
"havent": [{"F": "have", "pos": "VB"},
{"F": "nt", "L": "not", "pos": "RB"}],
"he'd": [{"F": "he", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD"}],
"hed": [{"F": "he", "L": "-PRON-"},
{"F": "d", "L": "would", "pos": "MD"}],
"he'd've": [{"F": "he", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD"},
{"F": "'ve", "pos": "VB"}],
"hedve": [{"F": "he", "L": "-PRON-"},
{"F": "d", "L": "would", "pos": "MD"},
{"F": "ve", "pos": "VB"}],
"he'll": [{"F": "he", "L": "-PRON-"},
{"F": "'ll", "L": "will", "pos": "MD"}],
"he's": [{"F": "he", "L": "-PRON-"},
{"F": "'s"}],
"hes": [{"F": "he", "L": "-PRON-"},
{"F": "s"}],
"how'd": [{"F": "how"},
{"F": "'d", "L": "would", "pos": "MD"}],
"howd": [{"F": "how"},
{"F": "d", "L": "would", "pos": "MD"}],
"how'll": [{"F": "how"},
{"F": "'ll", "L": "will", "pos": "MD"}],
"howll": [{"F": "how"},
{"F": "ll", "L": "will", "pos": "MD"}],
"how's": [{"F": "how"},
{"F": "'s"}],
"hows": [{"F": "how"},
{"F": "s"}],
"I'd": [{"F": "I", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD"}],
"I'd've": [{"F": "I", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD"},
{"F": "'ve", "pos": "VB"}],
"I'll": [{"F": "I", "L": "-PRON-"},
{"F": "'ll", "L": "will", "pos": "MD"}],
"i'll": [{"F": "i", "L": "-PRON-"},
{"F": "'ll", "L": "will", "pos": "MD"}],
"I'm": [{"F": "I", "L": "-PRON-"},
{"F": "'m", "L": "be", "pos": "VBP", "number": 1, "tenspect": 1}],
"i'm": [{"F": "i", "L": "-PRON-"},
{"F": "'m", "L": "be", "pos": "VBP", "number": 1, "tenspect": 1}],
"Im": [{"F": "I", "L": "-PRON-"},
{"F": "m", "L": "be", "pos": "VBP", "number": 1, "tenspect": 1}],
"im": [{"F": "i", "L": "-PRON-"},
{"F": "m", "L": "be", "pos": "VBP", "number": 1, "tenspect": 1}],
"I'ma": [{"F": "I", "L": "-PRON-"},
{"F": "'ma"}],
"i'ma": [{"F": "i", "L": "-PRON-"},
{"F": "'ma"}],
"I've": [{"F": "I", "L": "-PRON-"},
{"F": "'ve", "pos": "VB", "L": "have", "pos": "MD"}],
"i've": [{"F": "i", "L": "-PRON-"},
{"F": "'ve", "pos": "VB", "L": "have", "pos": "MD"}],
"isn't": [{"F": "is", "L": "be", "pos": "VBZ"},
{"F": "n't", "L": "not", "pos": "RB"}],
"isnt": [{"F": "is", "L": "be", "pos": "VBZ"},
{"F": "nt", "L": "not", "pos": "RB"}],
"Isn't": [{"F": "Is", "L": "be", "pos": "VBZ"},
{"F": "n't", "L": "not", "pos": "RB"}],
"It'd": [{"F": "It", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD"}],
"it'd": [{"F": "it", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD"}],
"it'd've": [{"F": "it", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD"},
{"F": "'ve"}],
"it'll": [{"F": "it", "L": "-PRON-"},
{"F": "'ll", "L": "will", "pos": "MD"}],
"itll": [{"F": "it", "L": "-PRON-"},
{"F": "ll", "L": "will", "pos": "MD"}],
"it's": [{"F": "it", "L": "-PRON-"},
{"F": "'s"}],
"let's": [{"F": "let"},
{"F": "'s"}],
"lets": [{"F": "let"},
{"F": "s", "L": "'s"}],
"mightn't": [{"F": "might"},
{"F": "n't", "L": "not", "pos": "RB"}],
"mightn't've": [{"F": "might"},
{"F": "n't", "L": "not", "pos": "RB"},
{"F": "'ve", "pos": "VB"}],
"might've": [{"F": "might"},
{"F": "'ve", "pos": "VB"}],
"mustn't": [{"F": "must"},
{"F": "n't", "L": "not", "pos": "RB"}],
"must've": [{"F": "must"},
{"F": "'ve", "pos": "VB"}],
"needn't": [{"F": "need"},
{"F": "n't", "L": "not", "pos": "RB"}],
"not've": [{"F": "not"},
{"F": "'ve", "pos": "VB"}],
"shan't": [{"F": "sha"},
{"F": "n't", "L": "not", "pos": "RB"}],
"she'd": [{"F": "she", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD"}],
"she'd've": [{"F": "she", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD"},
{"F": "'ve", "pos": "VB"}],
"she'll": [{"F": "she", "L": "-PRON-"},
{"F": "'ll", "L": "will"}],
"she's": [{"F": "she", "L": "-PRON-"},
{"F": "'s"}],
"should've": [{"F": "should"},
{"F": "'ve", "pos": "VB"}],
"shouldn't": [{"F": "should"},
{"F": "n't", "L": "not", "pos": "RB"}],
"shouldn't've": [{"F": "should"},
{"F": "n't", "L": "not", "pos": "RB"},
{"F": "'ve"}],
"that's": [{"F": "that"},
{"F": "'s"}],
"thats": [{"F": "that"},
{"F": "s", "L": "'s"}],
"there'd": [{"F": "there"},
{"F": "'d", "L": "would", "pos": "MD"}],
"there'd've": [{"F": "there"},
{"F": "'d", "L": "would", "pos": "MD"},
{"F": "'ve", "pos": "VB"}],
"there's": [{"F": "there"},
{"F": "'s"}],
"they'd": [{"F": "they", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD", "pos": "VB"}],
"They'd": [{"F": "They", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD", "pos": "VB"}],
"they'd've": [{"F": "they", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD"},
{"F": "'ve", "pos": "VB"}],
"They'd've": [{"F": "They", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD"},
{"F": "'ve", "pos": "VB"}],
"they'll": [{"F": "they", "L": "-PRON-"},
{"F": "'ll", "L": "will", "pos": "MD"}],
"They'll": [{"F": "They", "L": "-PRON-"},
{"F": "'ll", "L": "will", "pos": "MD"}],
"they're": [{"F": "they", "L": "-PRON-"},
{"F": "'re"}],
"They're": [{"F": "They", "L": "-PRON-"},
{"F": "'re"}],
"they've": [{"F": "they", "L": "-PRON-"},
{"F": "'ve", "pos": "VB"}],
"They've": [{"F": "They", "L": "-PRON-"},
{"F": "'ve", "pos": "VB"}],
"wasn't": [{"F": "was"},
{"F": "n't", "L": "not", "pos": "RB"}],
"we'd": [{"F": "we"},
{"F": "'d", "L": "would", "pos": "MD"}],
"We'd": [{"F": "We"},
{"F": "'d", "L": "would", "pos": "MD"}],
"we'd've": [{"F": "we"},
{"F": "'d", "L": "would", "pos": "MD"},
{"F": "'ve", "pos": "VB"}],
"we'll": [{"F": "we"},
{"F": "'ll", "L": "will", "pos": "MD"}],
"We'll": [{"F": "We", "L": "we"},
{"F": "'ll", "L": "will", "pos": "MD"}],
"we're": [{"F": "we"},
{"F": "'re"}],
"We're": [{"F": "We"},
{"F": "'re"}],
"we've": [{"F": "we"},
{"F": "'ve", "pos": "VB"}],
"We've": [{"F": "We"},
{"F": "'ve", "pos": "VB"}],
"weren't": [{"F": "were"},
{"F": "n't", "L": "not", "pos": "RB"}],
"what'll": [{"F": "what"},
{"F": "'ll", "L": "will", "pos": "MD"}],
"what're": [{"F": "what"},
{"F": "'re"}],
"what's": [{"F": "what"},
{"F": "'s"}],
"what've": [{"F": "what"},
{"F": "'ve", "pos": "VB"}],
"when's": [{"F": "when"},
{"F": "'s"}],
"where'd": [{"F": "where"},
{"F": "'d", "L": "would", "pos": "MD"}],
"where's": [{"F": "where"},
{"F": "'s"}],
"where've": [{"F": "where"},
{"F": "'ve", "pos": "VB"}],
"who'd": [{"F": "who"},
{"F": "'d", "L": "would", "pos": "MD"}],
"who'll": [{"F": "who"},
{"F": "'ll", "L": "will", "pos": "MD"}],
"who're": [{"F": "who"},
{"F": "'re"}],
"who's": [{"F": "who"},
{"F": "'s"}],
"who've": [{"F": "who"},
{"F": "'ve", "pos": "VB"}],
"why'll": [{"F": "why"},
{"F": "'ll", "L": "will", "pos": "MD"}],
"why're": [{"F": "why"},
{"F": "'re"}],
"why's": [{"F": "why"},
{"F": "'s"}],
"won't": [{"F": "wo"},
{"F": "n't", "L": "not", "pos": "RB"}],
"wont": [{"F": "wo"},
{"F": "nt", "L": "not", "pos": "RB"}],
"would've": [{"F": "would"},
{"F": "'ve", "pos": "VB"}],
"wouldn't": [{"F": "would"},
{"F": "n't", "L": "not", "pos": "RB"}],
"wouldn't've": [{"F": "would"},
{"F": "n't", "L": "not", "pos": "RB"},
{"F": "'ve", "L": "have", "pos": "VB"}],
"you'd": [{"F": "you", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD"}],
"you'd've": [{"F": "you", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD"},
{"F": "'ve", "L": "have", "pos": "VB"}],
"you'll": [{"F": "you", "L": "-PRON-"},
{"F": "'ll", "L": "will", "pos": "MD"}],
"You'll": [{"F": "You", "L": "-PRON-"},
{"F": "'ll", "L": "will", "pos": "MD"}],
"you're": [{"F": "you", "L": "-PRON-"},
{"F": "'re"}],
"You're": [{"F": "You", "L": "-PRON-"},
{"F": "'re"}],
"you've": [{"F": "you", "L": "-PRON-"},
{"F": "'ve", "L": "have", "pos": "VB"}],
"You've": [{"F": "You", "L": "-PRON-"},
{"F": "'ve", "L": "have", "pos": "VB"}],
"'em": [{"F": "'em"}],
"'ol": [{"F": "'ol"}],
"vs.": [{"F": "vs."}],
"Ms.": [{"F": "Ms."}],
"Mr.": [{"F": "Mr."}],
"Dr.": [{"F": "Dr."}],
"Mrs.": [{"F": "Mrs."}],
"Messrs.": [{"F": "Messrs."}],
"Gov.": [{"F": "Gov."}],
"Gen.": [{"F": "Gen."}],
"Mt.": [{"F": "Mt.", "L": "Mount"}],
"''": [{"F": "''"}],
"Corp.": [{"F": "Corp."}],
"Inc.": [{"F": "Inc."}],
"Co.": [{"F": "Co."}],
"co.": [{"F": "co."}],
"Ltd.": [{"F": "Ltd."}],
"Bros.": [{"F": "Bros."}],
"Rep.": [{"F": "Rep."}],
"Sen.": [{"F": "Sen."}],
"Jr.": [{"F": "Jr."}],
"Rev.": [{"F": "Rev."}],
"Adm.": [{"F": "Adm."}],
"St.": [{"F": "St."}],
"a.m.": [{"F": "a.m."}],
"p.m.": [{"F": "p.m."}],
"1a.m.": [{"F": "1"}, {"F": "a.m."}],
"2a.m.": [{"F": "2"}, {"F": "a.m."}],
"3a.m.": [{"F": "3"}, {"F": "a.m."}],
"4a.m.": [{"F": "4"}, {"F": "a.m."}],
"5a.m.": [{"F": "5"}, {"F": "a.m."}],
"6a.m.": [{"F": "6"}, {"F": "a.m."}],
"7a.m.": [{"F": "7"}, {"F": "a.m."}],
"8a.m.": [{"F": "8"}, {"F": "a.m."}],
"9a.m.": [{"F": "9"}, {"F": "a.m."}],
"10a.m.": [{"F": "10"}, {"F": "a.m."}],
"11a.m.": [{"F": "11"}, {"F": "a.m."}],
"12a.m.": [{"F": "12"}, {"F": "a.m."}],
"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],
"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],
"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],
"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],
"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],
"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],
"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],
"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],
"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],
"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],
"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],
"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],
"p.m.": [{"F": "p.m."}],
"1p.m.": [{"F": "1"}, {"F": "p.m."}],
"2p.m.": [{"F": "2"}, {"F": "p.m."}],
"3p.m.": [{"F": "3"}, {"F": "p.m."}],
"4p.m.": [{"F": "4"}, {"F": "p.m."}],
"5p.m.": [{"F": "5"}, {"F": "p.m."}],
"6p.m.": [{"F": "6"}, {"F": "p.m."}],
"7p.m.": [{"F": "7"}, {"F": "p.m."}],
"8p.m.": [{"F": "8"}, {"F": "p.m."}],
"9p.m.": [{"F": "9"}, {"F": "p.m."}],
"10p.m.": [{"F": "10"}, {"F": "p.m."}],
"11p.m.": [{"F": "11"}, {"F": "p.m."}],
"12p.m.": [{"F": "12"}, {"F": "p.m."}],
"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],
"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],
"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],
"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],
"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],
"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],
"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],
"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],
"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],
"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],
"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],
"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],
"Jan.": [{"F": "Jan."}],
"Feb.": [{"F": "Feb."}],
"Mar.": [{"F": "Mar."}],
"Apr.": [{"F": "Apr."}],
"May.": [{"F": "May."}],
"Jun.": [{"F": "Jun."}],
"Jul.": [{"F": "Jul."}],
"Aug.": [{"F": "Aug."}],
"Sep.": [{"F": "Sep."}],
"Sept.": [{"F": "Sept."}],
"Oct.": [{"F": "Oct."}],
"Nov.": [{"F": "Nov."}],
"Dec.": [{"F": "Dec."}],
"Ala.": [{"F": "Ala."}],
"Ariz.": [{"F": "Ariz."}],
"Ark.": [{"F": "Ark."}],
"Calif.": [{"F": "Calif."}],
"Colo.": [{"F": "Colo."}],
"Conn.": [{"F": "Conn."}],
"Del.": [{"F": "Del."}],
"D.C.": [{"F": "D.C."}],
"Fla.": [{"F": "Fla."}],
"Ga.": [{"F": "Ga."}],
"Ill.": [{"F": "Ill."}],
"Ind.": [{"F": "Ind."}],
"Kans.": [{"F": "Kans."}],
"Kan.": [{"F": "Kan."}],
"Ky.": [{"F": "Ky."}],
"La.": [{"F": "La."}],
"Md.": [{"F": "Md."}],
"Mass.": [{"F": "Mass."}],
"Mich.": [{"F": "Mich."}],
"Minn.": [{"F": "Minn."}],
"Miss.": [{"F": "Miss."}],
"Mo.": [{"F": "Mo."}],
"Mont.": [{"F": "Mont."}],
"Nebr.": [{"F": "Nebr."}],
"Neb.": [{"F": "Neb."}],
"Nev.": [{"F": "Nev."}],
"N.H.": [{"F": "N.H."}],
"N.J.": [{"F": "N.J."}],
"N.M.": [{"F": "N.M."}],
"N.Y.": [{"F": "N.Y."}],
"N.C.": [{"F": "N.C."}],
"N.D.": [{"F": "N.D."}],
"Okla.": [{"F": "Okla."}],
"Ore.": [{"F": "Ore."}],
"Pa.": [{"F": "Pa."}],
"Tenn.": [{"F": "Tenn."}],
"Va.": [{"F": "Va."}],
"Wash.": [{"F": "Wash."}],
"Wis.": [{"F": "Wis."}],
":)": [{"F": ":)"}],
"<3": [{"F": "<3"}],
";)": [{"F": ";)"}],
"(:": [{"F": "(:"}],
":(": [{"F": ":("}],
"-_-": [{"F": "-_-"}],
"=)": [{"F": "=)"}],
":/": [{"F": ":/"}],
":>": [{"F": ":>"}],
";-)": [{"F": ";-)"}],
":Y": [{"F": ":Y"}],
":P": [{"F": ":P"}],
":-P": [{"F": ":-P"}],
":3": [{"F": ":3"}],
"=3": [{"F": "=3"}],
"xD": [{"F": "xD"}],
"^_^": [{"F": "^_^"}],
"=]": [{"F": "=]"}],
"=D": [{"F": "=D"}],
"<333": [{"F": "<333"}],
":))": [{"F": ":))"}],
":0": [{"F": ":0"}],
"-__-": [{"F": "-__-"}],
"xDD": [{"F": "xDD"}],
"o_o": [{"F": "o_o"}],
"o_O": [{"F": "o_O"}],
"V_V": [{"F": "V_V"}],
"=[[": [{"F": "=[["}],
"<33": [{"F": "<33"}],
";p": [{"F": ";p"}],
";D": [{"F": ";D"}],
";-p": [{"F": ";-p"}],
";(": [{"F": ";("}],
":p": [{"F": ":p"}],
":]": [{"F": ":]"}],
":O": [{"F": ":O"}],
":-/": [{"F": ":-/"}],
":-)": [{"F": ":-)"}],
":(((": [{"F": ":((("}],
":((": [{"F": ":(("}],
":')": [{"F": ":')"}],
"(^_^)": [{"F": "(^_^)"}],
"(=": [{"F": "(="}],
"o.O": [{"F": "o.O"}],
"\")": [{"F": "\")"}],
"a.": [{"F": "a."}],
"b.": [{"F": "b."}],
"c.": [{"F": "c."}],
"d.": [{"F": "d."}],
"e.": [{"F": "e."}],
"f.": [{"F": "f."}],
"g.": [{"F": "g."}],
"h.": [{"F": "h."}],
"i.": [{"F": "i."}],
"j.": [{"F": "j."}],
"k.": [{"F": "k."}],
"l.": [{"F": "l."}],
"m.": [{"F": "m."}],
"n.": [{"F": "n."}],
"o.": [{"F": "o."}],
"p.": [{"F": "p."}],
"q.": [{"F": "q."}],
"s.": [{"F": "s."}],
"t.": [{"F": "t."}],
"u.": [{"F": "u."}],
"v.": [{"F": "v."}],
"w.": [{"F": "w."}],
"x.": [{"F": "x."}],
"y.": [{"F": "y."}],
"z.": [{"F": "z."}],
"i.e.": [{"F": "i.e."}],
"I.e.": [{"F": "I.e."}],
"I.E.": [{"F": "I.E."}],
"e.g.": [{"F": "e.g."}],
"E.g.": [{"F": "E.g."}],
"E.G.": [{"F": "E.G."}],
"\n": [{"F": "\n", "pos": "SP"}],
"\t": [{"F": "\t", "pos": "SP"}],
" ": [{"F": " ", "pos": "SP"}]
}
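
The deleted specials.json uses the old short keys: "F" for the token's form, "L" for its lemma, "pos" for the fine-grained tag. The reorganized per-language modules carry the same information under attribute names; the key mapping assumed below (F to ORTH, L to LEMMA, pos to TAG) is only an illustration of that translation, not code from this PR:

# Hedged sketch: convert one old-style specials.json entry into the
# attribute-name format used by the new per-language Python modules.
import json

OLD = json.loads("""
{
  "don't": [{"F": "do", "L": "do"},
            {"F": "n't", "L": "not", "pos": "RB"}]
}
""")

KEY_MAP = {"F": "ORTH", "L": "LEMMA", "pos": "TAG"}  # assumed mapping

def convert(entries):
    converted = {}
    for form, tokens in entries.items():
        converted[form] = [
            {KEY_MAP.get(k, k): v for k, v in tok.items()} for tok in tokens
        ]
    return converted

print(json.dumps(convert(OLD), indent=2))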

View File

@@ -1,26 +0,0 @@
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
''
's
'S
s
S
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[0-9])km
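
The deleted prefix.txt, suffix.txt and infix.txt each hold one pattern per line: prefixes are literals to be escaped, suffixes and infixes are raw regex fragments with lookarounds. A minimal sketch of compiling such line-per-pattern lists into the three expressions a rule-based tokenizer consults (a hypothetical loader written for illustration, not the one removed here):

import re

PREFIXES = [re.escape(p) for p in r"""
,
"
(
[
""".strip().split("\n")]

SUFFIXES = r"""
,
\)
\]
(?<=[0-9])km
""".strip().split("\n")

INFIXES = r"""\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-Z])""".split()

# Anchor prefixes at the start of the string and suffixes at the end;
# infixes are searched anywhere inside the token.
prefix_re = re.compile(r"^(?:" + "|".join(PREFIXES) + r")")
suffix_re = re.compile(r"(?:" + "|".join(SUFFIXES) + r")$")
infix_re = re.compile(r"(?:" + "|".join(INFIXES) + r")")

print(prefix_re.match('("hola'))            # strips leading punctuation
print(suffix_re.search("10km"))             # matches the unit suffix
print(infix_re.search("Madrid-Barcelona"))  # finds the hyphen infix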

View File

@@ -1,198 +0,0 @@
{
"Reddit": [
"PRODUCT",
{},
[
[{"lower": "reddit"}]
]
],
"SeptemberElevenAttacks": [
"EVENT",
{},
[
[
{"orth": "9/11"}
],
[
{"lower": "Septmber"},
{"lower": "Eleven"}
],
[
{"lower": "september"},
{"orth": "11"}
]
]
],
"Linux": [
"PRODUCT",
{},
[
[{"lower": "linux"}]
]
],
"Haskell": [
"PRODUCT",
{},
[
[{"lower": "haskell"}]
]
],
"HaskellCurry": [
"PERSON",
{},
[
[
{"lower": "haskell"},
{"lower": "curry"}
]
]
],
"Javascript": [
"PRODUCT",
{},
[
[{"lower": "javascript"}]
]
],
"CSS": [
"PRODUCT",
{},
[
[{"lower": "css"}],
[{"lower": "css3"}]
]
],
"displaCy": [
"PRODUCT",
{},
[
[{"lower": "displacy"}]
]
],
"spaCy": [
"PRODUCT",
{},
[
[{"orth": "spaCy"}]
]
],
"HTML": [
"PRODUCT",
{},
[
[{"lower": "html"}],
[{"lower": "html5"}]
]
],
"Python": [
"PRODUCT",
{},
[
[{"orth": "Python"}]
]
],
"Ruby": [
"PRODUCT",
{},
[
[{"orth": "Ruby"}]
]
],
"Digg": [
"PRODUCT",
{},
[
[{"lower": "digg"}]
]
],
"FoxNews": [
"ORG",
{},
[
[{"orth": "Fox"}],
[{"orth": "News"}]
]
],
"Google": [
"ORG",
{},
[
[{"lower": "google"}]
]
],
"Mac": [
"PRODUCT",
{},
[
[{"lower": "mac"}]
]
],
"Wikipedia": [
"PRODUCT",
{},
[
[{"lower": "wikipedia"}]
]
],
"Windows": [
"PRODUCT",
{},
[
[{"orth": "Windows"}]
]
],
"Dell": [
"ORG",
{},
[
[{"lower": "dell"}]
]
],
"Facebook": [
"ORG",
{},
[
[{"lower": "facebook"}]
]
],
"Blizzard": [
"ORG",
{},
[
[{"orth": "Facebook"}]
]
],
"Ubuntu": [
"ORG",
{},
[
[{"orth": "Ubuntu"}]
]
],
"Youtube": [
"PRODUCT",
{},
[
[{"lower": "youtube"}]
]
],
"false_positives": [
null,
{},
[
[{"orth": "Shit"}],
[{"orth": "Weed"}],
[{"orth": "Cool"}],
[{"orth": "Btw"}],
[{"orth": "Bah"}],
[{"orth": "Bullshit"}],
[{"orth": "Lol"}],
[{"orth": "Yo"}, {"lower": "dawg"}],
[{"orth": "Yay"}],
[{"orth": "Ahh"}],
[{"orth": "Yea"}],
[{"orth": "Bah"}]
]
]
}
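
Every gazetteer entry above is a triple of entity label, shared attributes and a list of token-pattern sequences, where each token pattern constrains attributes such as orth or lower. The sketch below runs such patterns over a pre-tokenized sentence using plain dicts for tokens; spaCy's Matcher does the equivalent over Doc objects with a different API, so this is only an illustration of the data's shape:

GAZETTEER = {
    "spaCy": ["PRODUCT", {}, [[{"orth": "spaCy"}]]],
    "HaskellCurry": ["PERSON", {}, [[{"lower": "haskell"}, {"lower": "curry"}]]],
}

def token_matches(token, pattern):
    # A token is a dict with "orth"; "lower" is derived from it.
    for attr, value in pattern.items():
        actual = token["orth"] if attr == "orth" else token["orth"].lower()
        if actual != value:
            return False
    return True

def find_entities(tokens):
    hits = []
    for name, (label, _attrs, patterns) in GAZETTEER.items():
        for pattern in patterns:
            n = len(pattern)
            for i in range(len(tokens) - n + 1):
                window = tokens[i:i + n]
                if all(token_matches(t, p) for t, p in zip(window, pattern)):
                    hits.append((label, i, i + n))
    return hits

tokens = [{"orth": w} for w in "Haskell Curry never used spaCy".split()]
print(sorted(find_entities(tokens)))  # [('PERSON', 0, 2), ('PRODUCT', 4, 5)]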

View File

@@ -1,31 +0,0 @@
{
"noun": [
["s", ""],
["ses", "s"],
["ves", "f"],
["xes", "x"],
["zes", "z"],
["ches", "ch"],
["shes", "sh"],
["men", "man"],
["ies", "y"]
],
"verb": [
["s", ""],
["ies", "y"],
["es", "e"],
["es", ""],
["ed", "e"],
["ed", ""],
["ing", "e"],
["ing", ""]
],
"adj": [
["er", ""],
["est", ""],
["er", "e"],
["est", "e"]
]
}
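
lemma_rules.json lists per-POS suffix rewrites in WordNet style: strip the first suffix, append the second, and keep the candidate if it is an attested form. A compact sketch of that procedure, with a toy vocabulary standing in for the real word list:

LEMMA_RULES = {
    "noun": [["s", ""], ["ses", "s"], ["ies", "y"], ["men", "man"]],
    "verb": [["s", ""], ["ies", "y"], ["ed", ""], ["ing", ""]],
}

KNOWN_WORDS = {"city", "man", "walk"}  # toy vocabulary for the demo

def lemmatize(word, pos):
    # Try each (old_suffix, new_suffix) rule; keep candidates attested in
    # the vocabulary, falling back to the surface form itself.
    candidates = set()
    for old, new in LEMMA_RULES.get(pos, []):
        if word.endswith(old):
            candidate = word[:len(word) - len(old)] + new
            if candidate in KNOWN_WORDS:
                candidates.add(candidate)
    return min(candidates) if candidates else word

print(lemmatize("cities", "noun"))   # 'city'
print(lemmatize("walking", "verb"))  # 'walk'
print(lemmatize("men", "noun"))      # 'man'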

Binary file not shown.

View File

@@ -1 +0,0 @@
-20.000000

File diff suppressed because it is too large

View File

@@ -1,56 +0,0 @@
{
"$(": {"pos": "PUNCT", "PunctType": "Brck"},
"$,": {"pos": "PUNCT", "PunctType": "Comm"},
"$.": {"pos": "PUNCT", "PunctType": "Peri"},
"ADJA": {"pos": "ADJ"},
"ADJD": {"pos": "ADJ", "Variant": "Short"},
"ADV": {"pos": "ADV"},
"APPO": {"pos": "ADP", "AdpType": "Post"},
"APPR": {"pos": "ADP", "AdpType": "Prep"},
"APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"},
"APZR": {"pos": "ADP", "AdpType": "Circ"},
"ART": {"pos": "DET", "PronType": "Art"},
"CARD": {"pos": "NUM", "NumType": "Card"},
"FM": {"pos": "X", "Foreign": "Yes"},
"ITJ": {"pos": "INTJ"},
"KOKOM": {"pos": "CONJ", "ConjType": "Comp"},
"KON": {"pos": "CONJ"},
"KOUI": {"pos": "SCONJ"},
"KOUS": {"pos": "SCONJ"},
"NE": {"pos": "PROPN"},
"NN": {"pos": "NOUN"},
"PAV": {"pos": "ADV", "PronType": "Dem"},
"PDAT": {"pos": "DET", "PronType": "Dem"},
"PDS": {"pos": "PRON", "PronType": "Dem"},
"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"},
"PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"},
"PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"},
"PPER": {"pos": "PRON", "PronType": "Prs"},
"PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"},
"PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"},
"PRELAT": {"pos": "DET", "PronType": "Rel"},
"PRELS": {"pos": "PRON", "PronType": "Rel"},
"PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"},
"PTKA": {"pos": "PART"},
"PTKANT": {"pos": "PART", "PartType": "Res"},
"PTKNEG": {"pos": "PART", "Negative": "Neg"},
"PTKVZ": {"pos": "PART", "PartType": "Vbp"},
"PTKZU": {"pos": "PART", "PartType": "Inf"},
"PWAT": {"pos": "DET", "PronType": "Int"},
"PWAV": {"pos": "ADV", "PronType": "Int"},
"PWS": {"pos": "PRON", "PronType": "Int"},
"TRUNC": {"pos": "X", "Hyph": "Yes"},
"VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"},
"VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"},
"VAINF": {"pos": "AUX", "VerbForm": "Inf"},
"VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"},
"VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"},
"VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"},
"VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"},
"VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"},
"VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"},
"VVINF": {"pos": "VERB", "VerbForm": "Inf"},
"VVIZU": {"pos": "VERB", "VerbForm": "Inf"},
"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},
"XY": {"pos": "X"}
}
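
The tag map keys fine-grained treebank tags (the German STTS set here) to a coarse Universal POS plus morphological features, so a tagger's raw output can be expanded in one lookup. A minimal sketch over a trimmed copy of the table:

TAG_MAP = {
    "NN":    {"pos": "NOUN"},
    "ADJD":  {"pos": "ADJ", "Variant": "Short"},
    "VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"},
}

def expand_tag(fine_tag):
    # Split the mapped entry into the coarse POS and the remaining
    # morphological features; unknown tags fall back to 'X'.
    entry = dict(TAG_MAP.get(fine_tag, {"pos": "X"}))
    pos = entry.pop("pos")
    return pos, entry

for tag in ("VVFIN", "NN", "UNKNOWN"):
    print(tag, expand_tag(tag))
# VVFIN ('VERB', {'Mood': 'Ind', 'VerbForm': 'Fin'})
# NN ('NOUN', {})
# UNKNOWN ('X', {})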

View File

@@ -1,356 +1,14 @@
# encoding: utf8
from __future__ import unicode_literals
+from .. import language_data as base
+from ..language_data import strings_to_exc
+from .stop_words import STOP_WORDS
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+STOP_WORDS = set(STOP_WORDS)
+__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
import re
STOP_WORDS = set()
TOKENIZER_PREFIXES = map(re.escape, r'''
,
"
(
[
{
*
<
>
$
£
'
``
`
#
US$
C$
A$
a-
....
...
»
_
§
'''.strip().split('\n'))
TOKENIZER_SUFFIXES = r'''
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
«
_
''
's
'S
s
S
°
\.\.
\.\.\.
\.\.\.\.
(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
\-\-
´
(?<=[0-9])km²
(?<=[0-9])m²
(?<=[0-9])cm²
(?<=[0-9])mm²
(?<=[0-9])km³
(?<=[0-9])m³
(?<=[0-9])cm³
(?<=[0-9])mm³
(?<=[0-9])ha
(?<=[0-9])km
(?<=[0-9])m
(?<=[0-9])cm
(?<=[0-9])mm
(?<=[0-9])µm
(?<=[0-9])nm
(?<=[0-9])yd
(?<=[0-9])in
(?<=[0-9])ft
(?<=[0-9])kg
(?<=[0-9])g
(?<=[0-9])mg
(?<=[0-9])µg
(?<=[0-9])t
(?<=[0-9])lb
(?<=[0-9])oz
(?<=[0-9])m/s
(?<=[0-9])km/h
(?<=[0-9])mph
(?<=[0-9])°C
(?<=[0-9])°K
(?<=[0-9])°F
(?<=[0-9])hPa
(?<=[0-9])Pa
(?<=[0-9])mbar
(?<=[0-9])mb
(?<=[0-9])T
(?<=[0-9])G
(?<=[0-9])M
(?<=[0-9])K
(?<=[0-9])kb
'''.strip().split('\n')
TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) '''
r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
TOKENIZER_EXCEPTIONS = {
"vs.": [{"F": "vs."}],
"''": [{"F": "''"}],
"": [{"F": "", "L": "--", "pos": "$,"}],
"a.m.": [{"F": "a.m."}],
"p.m.": [{"F": "p.m."}],
"1a.m.": [{"F": "1"}, {"F": "a.m."}],
"2a.m.": [{"F": "2"}, {"F": "a.m."}],
"3a.m.": [{"F": "3"}, {"F": "a.m."}],
"4a.m.": [{"F": "4"}, {"F": "a.m."}],
"5a.m.": [{"F": "5"}, {"F": "a.m."}],
"6a.m.": [{"F": "6"}, {"F": "a.m."}],
"7a.m.": [{"F": "7"}, {"F": "a.m."}],
"8a.m.": [{"F": "8"}, {"F": "a.m."}],
"9a.m.": [{"F": "9"}, {"F": "a.m."}],
"10a.m.": [{"F": "10"}, {"F": "a.m."}],
"11a.m.": [{"F": "11"}, {"F": "a.m."}],
"12a.m.": [{"F": "12"}, {"F": "a.m."}],
"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],
"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],
"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],
"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],
"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],
"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],
"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],
"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],
"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],
"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],
"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],
"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],
"p.m.": [{"F": "p.m."}],
"1p.m.": [{"F": "1"}, {"F": "p.m."}],
"2p.m.": [{"F": "2"}, {"F": "p.m."}],
"3p.m.": [{"F": "3"}, {"F": "p.m."}],
"4p.m.": [{"F": "4"}, {"F": "p.m."}],
"5p.m.": [{"F": "5"}, {"F": "p.m."}],
"6p.m.": [{"F": "6"}, {"F": "p.m."}],
"7p.m.": [{"F": "7"}, {"F": "p.m."}],
"8p.m.": [{"F": "8"}, {"F": "p.m."}],
"9p.m.": [{"F": "9"}, {"F": "p.m."}],
"10p.m.": [{"F": "10"}, {"F": "p.m."}],
"11p.m.": [{"F": "11"}, {"F": "p.m."}],
"12p.m.": [{"F": "12"}, {"F": "p.m."}],
"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],
"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],
"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],
"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],
"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],
"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],
"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],
"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],
"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],
"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],
"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],
"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],
"Ala.": [{"F": "Ala."}],
"Ariz.": [{"F": "Ariz."}],
"Ark.": [{"F": "Ark."}],
"Calif.": [{"F": "Calif."}],
"Colo.": [{"F": "Colo."}],
"Conn.": [{"F": "Conn."}],
"Del.": [{"F": "Del."}],
"D.C.": [{"F": "D.C."}],
"Fla.": [{"F": "Fla."}],
"Ga.": [{"F": "Ga."}],
"Ill.": [{"F": "Ill."}],
"Ind.": [{"F": "Ind."}],
"Kans.": [{"F": "Kans."}],
"Kan.": [{"F": "Kan."}],
"Ky.": [{"F": "Ky."}],
"La.": [{"F": "La."}],
"Md.": [{"F": "Md."}],
"Mass.": [{"F": "Mass."}],
"Mich.": [{"F": "Mich."}],
"Minn.": [{"F": "Minn."}],
"Miss.": [{"F": "Miss."}],
"Mo.": [{"F": "Mo."}],
"Mont.": [{"F": "Mont."}],
"Nebr.": [{"F": "Nebr."}],
"Neb.": [{"F": "Neb."}],
"Nev.": [{"F": "Nev."}],
"N.H.": [{"F": "N.H."}],
"N.J.": [{"F": "N.J."}],
"N.M.": [{"F": "N.M."}],
"N.Y.": [{"F": "N.Y."}],
"N.C.": [{"F": "N.C."}],
"N.D.": [{"F": "N.D."}],
"Okla.": [{"F": "Okla."}],
"Ore.": [{"F": "Ore."}],
"Pa.": [{"F": "Pa."}],
"Tenn.": [{"F": "Tenn."}],
"Va.": [{"F": "Va."}],
"Wash.": [{"F": "Wash."}],
"Wis.": [{"F": "Wis."}],
":)": [{"F": ":)"}],
"<3": [{"F": "<3"}],
";)": [{"F": ";)"}],
"(:": [{"F": "(:"}],
":(": [{"F": ":("}],
"-_-": [{"F": "-_-"}],
"=)": [{"F": "=)"}],
":/": [{"F": ":/"}],
":>": [{"F": ":>"}],
";-)": [{"F": ";-)"}],
":Y": [{"F": ":Y"}],
":P": [{"F": ":P"}],
":-P": [{"F": ":-P"}],
":3": [{"F": ":3"}],
"=3": [{"F": "=3"}],
"xD": [{"F": "xD"}],
"^_^": [{"F": "^_^"}],
"=]": [{"F": "=]"}],
"=D": [{"F": "=D"}],
"<333": [{"F": "<333"}],
":))": [{"F": ":))"}],
":0": [{"F": ":0"}],
"-__-": [{"F": "-__-"}],
"xDD": [{"F": "xDD"}],
"o_o": [{"F": "o_o"}],
"o_O": [{"F": "o_O"}],
"V_V": [{"F": "V_V"}],
"=[[": [{"F": "=[["}],
"<33": [{"F": "<33"}],
";p": [{"F": ";p"}],
";D": [{"F": ";D"}],
";-p": [{"F": ";-p"}],
";(": [{"F": ";("}],
":p": [{"F": ":p"}],
":]": [{"F": ":]"}],
":O": [{"F": ":O"}],
":-/": [{"F": ":-/"}],
":-)": [{"F": ":-)"}],
":(((": [{"F": ":((("}],
":((": [{"F": ":(("}],
":')": [{"F": ":')"}],
"(^_^)": [{"F": "(^_^)"}],
"(=": [{"F": "(="}],
"o.O": [{"F": "o.O"}],
"\")": [{"F": "\")"}],
"a.": [{"F": "a."}],
"b.": [{"F": "b."}],
"c.": [{"F": "c."}],
"d.": [{"F": "d."}],
"e.": [{"F": "e."}],
"f.": [{"F": "f."}],
"g.": [{"F": "g."}],
"h.": [{"F": "h."}],
"i.": [{"F": "i."}],
"j.": [{"F": "j."}],
"k.": [{"F": "k."}],
"l.": [{"F": "l."}],
"m.": [{"F": "m."}],
"n.": [{"F": "n."}],
"o.": [{"F": "o."}],
"p.": [{"F": "p."}],
"q.": [{"F": "q."}],
"r.": [{"F": "r."}],
"s.": [{"F": "s."}],
"t.": [{"F": "t."}],
"u.": [{"F": "u."}],
"v.": [{"F": "v."}],
"w.": [{"F": "w."}],
"x.": [{"F": "x."}],
"y.": [{"F": "y."}],
"z.": [{"F": "z."}],
}
TAG_MAP = {
"$(": {"pos": "PUNCT", "PunctType": "Brck"},
"$,": {"pos": "PUNCT", "PunctType": "Comm"},
"$.": {"pos": "PUNCT", "PunctType": "Peri"},
"ADJA": {"pos": "ADJ"},
"ADJD": {"pos": "ADJ", "Variant": "Short"},
"ADV": {"pos": "ADV"},
"APPO": {"pos": "ADP", "AdpType": "Post"},
"APPR": {"pos": "ADP", "AdpType": "Prep"},
"APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"},
"APZR": {"pos": "ADP", "AdpType": "Circ"},
"ART": {"pos": "DET", "PronType": "Art"},
"CARD": {"pos": "NUM", "NumType": "Card"},
"FM": {"pos": "X", "Foreign": "Yes"},
"ITJ": {"pos": "INTJ"},
"KOKOM": {"pos": "CONJ", "ConjType": "Comp"},
"KON": {"pos": "CONJ"},
"KOUI": {"pos": "SCONJ"},
"KOUS": {"pos": "SCONJ"},
"NE": {"pos": "PROPN"},
"NNE": {"pos": "PROPN"},
"NN": {"pos": "NOUN"},
"PAV": {"pos": "ADV", "PronType": "Dem"},
"PROAV": {"pos": "ADV", "PronType": "Dem"},
"PDAT": {"pos": "DET", "PronType": "Dem"},
"PDS": {"pos": "PRON", "PronType": "Dem"},
"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"},
"PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"},
"PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"},
"PPER": {"pos": "PRON", "PronType": "Prs"},
"PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"},
"PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"},
"PRELAT": {"pos": "DET", "PronType": "Rel"},
"PRELS": {"pos": "PRON", "PronType": "Rel"},
"PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"},
"PTKA": {"pos": "PART"},
"PTKANT": {"pos": "PART", "PartType": "Res"},
"PTKNEG": {"pos": "PART", "Negative": "Neg"},
"PTKVZ": {"pos": "PART", "PartType": "Vbp"},
"PTKZU": {"pos": "PART", "PartType": "Inf"},
"PWAT": {"pos": "DET", "PronType": "Int"},
"PWAV": {"pos": "ADV", "PronType": "Int"},
"PWS": {"pos": "PRON", "PronType": "Int"},
"TRUNC": {"pos": "X", "Hyph": "Yes"},
"VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"},
"VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"},
"VAINF": {"pos": "AUX", "VerbForm": "Inf"},
"VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"},
"VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"},
"VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"},
"VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"},
"VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"},
"VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"},
"VVINF": {"pos": "VERB", "VerbForm": "Inf"},
"VVIZU": {"pos": "VERB", "VerbForm": "Inf"},
"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},
"XY": {"pos": "X"},
"SP": {"pos": "SPACE"}
}

spacy/it/stop_words.py (new file, 85 lines)
View File

@ -0,0 +1,85 @@
# encoding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl
agli ahime ahimè ai al alcuna alcuni alcuno all alla alle allo allora altri
altrimenti altro altrove altrui anche ancora anni anno ansa anticipo assai
attesa attraverso avanti avemmo avendo avente aver avere averlo avesse
avessero avessi avessimo aveste avesti avete aveva avevamo avevano avevate
avevi avevo avrai avranno avrebbe avrebbero avrei avremmo avremo avreste
avresti avrete avrà avrò avuta avute avuti avuto
basta bene benissimo brava bravo
casa caso cento certa certe certi certo che chi chicchessia chiunque ci
ciascuna ciascuno cima cio cioe circa citta città co codesta codesti codesto
cogli coi col colei coll coloro colui come cominci comunque con concernente
conciliarsi conclusione consiglio contro cortesia cos cosa cosi così cui
da dagl dagli dai dal dall dalla dalle dallo dappertutto davanti degl degli
dei del dell della delle dello dentro detto deve di dice dietro dire
dirimpetto diventa diventare diventato dopo dov dove dovra dovrà dovunque due
dunque durante
ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era
erano eravamo eravate eri ero esempio esse essendo esser essere essi ex
fa faccia facciamo facciano facciate faccio facemmo facendo facesse facessero
facessi facessimo faceste facesti faceva facevamo facevano facevate facevi
facevo fai fanno farai faranno fare farebbe farebbero farei faremmo faremo
fareste faresti farete farà farò fatto favore fece fecero feci fin finalmente
finche fine fino forse forza fosse fossero fossi fossimo foste fosti fra
frattempo fu fui fummo fuori furono futuro generale
gia già giacche giorni giorno gli gliela gliele glieli glielo gliene governo
grande grazie gruppo
ha haha hai hanno ho
ieri il improvviso in inc infatti inoltre insieme intanto intorno invece io
la lasciato lato lavoro le lei li lo lontano loro lui lungo luogo
ma macche magari maggior mai male malgrado malissimo mancanza marche me
medesimo mediante meglio meno mentre mesi mezzo mi mia mie miei mila miliardi
milioni minimi ministro mio modo molti moltissimo molto momento mondo mosto
nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun
nessuna nessuno niente no noi non nondimeno nonostante nonsia nostra nostre
nostri nostro novanta nove nulla nuovo
od oggi ogni ognuna ognuno oltre oppure ora ore osi ossia ottanta otto
paese parecchi parecchie parecchio parte partendo peccato peggio per perche
perché percio perciò perfino pero persino persone però piedi pieno piglia piu
piuttosto più po pochissimo poco poi poiche possa possedere posteriore posto
potrebbe preferibilmente presa press prima primo principalmente probabilmente
proprio puo può pure purtroppo
qualche qualcosa qualcuna qualcuno quale quali qualunque quando quanta quante
quanti quanto quantunque quasi quattro quel quella quelle quelli quello quest
questa queste questi questo qui quindi
realmente recente recentemente registrazione relativo riecco salvo
sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste
saresti sarete saro sarò scola scopo scorso se secondo seguente seguito sei
sembra sembrare sembrato sembri sempre senza sette si sia siamo siano siate
siete sig solito solo soltanto sono sopra sotto spesso srl sta stai stando
stanno starai staranno starebbe starebbero starei staremmo staremo stareste
staresti starete starà starò stata state stati stato stava stavamo stavano
stavate stavi stavo stemmo stessa stesse stessero stessi stessimo stesso
steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua
subito successivamente successivo sue sugl sugli sui sul sull sulla sulle
sullo suo suoi
tale tali talvolta tanto te tempo ti titolo torino tra tranne tre trenta
troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto
uguali ulteriore ultimo un una uno uomo
va vale vari varia varie vario verso vi via vicino visto vita voi volta volte
vostra vostre vostri vostro
""".split())

View File

@ -21,6 +21,7 @@ from .matcher import Matcher
from . import attrs
from . import orth
from . import util
+from . import language_data
from .lemmatizer import Lemmatizer
from .train import Trainer
@ -38,7 +39,7 @@ class BaseDefaults(object):
        if nlp is None or nlp.path is None:
            return Lemmatizer({}, {}, {})
        else:
-            return Lemmatizer.load(nlp.path)
+            return Lemmatizer.load(nlp.path, rules=cls.lemma_rules)
    @classmethod
    def create_vocab(cls, nlp=None):
@ -141,13 +142,13 @@ class BaseDefaults(object):
            pipeline.append(nlp.entity)
        return pipeline
-    prefixes = tuple()
+    prefixes = tuple(language_data.TOKENIZER_PREFIXES)
-    suffixes = tuple()
+    suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
-    infixes = tuple()
+    infixes = tuple(language_data.TOKENIZER_INFIXES)
-    tag_map = {}
+    tag_map = dict(language_data.TAG_MAP)
    tokenizer_exceptions = {}
@ -159,6 +160,8 @@ class BaseDefaults(object):
    stop_words = set()
+    lemma_rules = {}
    lex_attr_getters = {
        attrs.LOWER: lambda string: string.lower(),
        attrs.NORM: lambda string: string,
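With this change BaseDefaults gets real defaults from the shared language_data package, and the new lemma_rules attribute is forwarded to Lemmatizer.load. A hedged sketch of how a language-specific subclass can then override just the pieces that differ; the class and variable names below are illustrative, only the attribute names come from the diff, and the import path assumes this hunk is spacy/language.py:

from spacy.language import BaseDefaults  # assumed location of the class shown above

MY_STOP_WORDS = set("der die das und".split())  # illustrative data
MY_TAG_MAP = {"NN": {"pos": "NOUN"}}

class MyDefaults(BaseDefaults):
    # Override only what differs; prefixes, suffixes and infixes keep the
    # shared language_data values wired up in the diff above.
    stop_words = set(MY_STOP_WORDS)
    tag_map = dict(MY_TAG_MAP)
    lemma_rules = {"noun": [["s", ""]]}  # handed to Lemmatizer.load(..., rules=...)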

View File

@ -0,0 +1,5 @@
from .emoticons import *
from .punctuation import *
from .tag_map import *
from .entity_rules import *
from .util import *

View File

@ -0,0 +1,146 @@
# encoding: utf8
from __future__ import unicode_literals
EMOTICONS = set("""
:)
:-)
:))
:-))
:)))
:-)))
(:
(-:
=)
(=
:]
:-]
[:
[-:
:o)
(o:
:}
:-}
8)
8-)
(-8
;)
;-)
(;
(-;
:(
:-(
:((
:-((
:(((
:-(((
):
)-:
=(
>:(
:')
:'-)
:'(
:'-(
:/
:-/
=/
:|
:-|
:1
:P
:-P
:p
:-p
:O
:-O
:o
:-o
:0
:-0
:()
>:o
:*
:-*
:3
:-3
=3
:>
:->
:X
:-X
:x
:-x
:D
:-D
;D
;-D
=D
xD
XD
xDD
XDD
8D
8-D
^_^
^__^
^___^
>.<
>.>
<.<
._.
;_;
-_-
-__-
v.v
V.V
v_v
V_V
o_o
o_O
O_o
O_O
0_o
o_0
0_0
o.O
O.o
O.O
o.o
0.0
o.0
0.o
@_@
<3
<33
<333
</3
(^_^)
(-_-)
(._.)
(>_<)
(*_*)
(¬_¬)
ಠ_ಠ
(ಠ_ಠ)
¯\(ツ)/¯
(°°
><(((*>
""".split())
__all__ = [ "EMOTICONS" ]

View File

@ -0,0 +1,206 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
from .util import ENT_ID
ENTITY_RULES = [
{
ENT_ID: "Reddit",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "reddit"}]
]
},
{
ENT_ID: "Linux",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "linux"}]
]
},
{
ENT_ID: "Haskell",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "haskell"}],
]
},
{
ENT_ID: "HaskellCurry",
"attrs": {ENT_TYPE: "PERSON"},
"patterns": [
[{LOWER: "haskell"}, {LOWER: "curry"}]
]
},
{
ENT_ID: "Javascript",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "javascript"}],
]
},
{
ENT_ID: "CSS",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "css"}],
[{LOWER: "css3"}],
]
},
{
ENT_ID: "HTML",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "html"}],
[{LOWER: "html5"}],
]
},
{
ENT_ID: "Python",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{ORTH: "Python"}]
]
},
{
ENT_ID: "Ruby",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{ORTH: "Ruby"}]
]
},
{
ENT_ID: "spaCy",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "spacy"}]
]
},
{
ENT_ID: "displaCy",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "displacy"}]
]
},
{
ENT_ID: "Digg",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "digg"}]
]
},
{
ENT_ID: "FoxNews",
"attrs": {ENT_TYPE: "ORG"},
"patterns": [
[{LOWER: "foxnews"}],
[{LOWER: "fox"}, {LOWER: "news"}]
]
},
{
ENT_ID: "Google",
"attrs": {ENT_TYPE: "ORG"},
"patterns": [
[{LOWER: "google"}]
]
},
{
ENT_ID: "Mac",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "mac"}]
]
},
{
ENT_ID: "Wikipedia",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "wikipedia"}]
]
},
{
ENT_ID: "Windows",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{ORTH: "Windows"}]
]
},
{
ENT_ID: "Dell",
"attrs": {ENT_TYPE: "ORG"},
"patterns": [
[{LOWER: "dell"}]
]
},
{
ENT_ID: "Facebook",
"attrs": {ENT_TYPE: "ORG"},
"patterns": [
[{LOWER: "facebook"}]
]
},
{
ENT_ID: "Blizzard",
"attrs": {ENT_TYPE: "ORG"},
"patterns": [
[{ORTH: "Blizzard"}]
]
},
{
ENT_ID: "Ubuntu",
"attrs": {ENT_TYPE: "ORG"},
"patterns": [
[{ORTH: "Ubuntu"}]
]
},
{
ENT_ID: "YouTube",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "youtube"}]
]
}
]
FALSE_POSITIVES = [
[{ORTH: "Shit"}],
[{ORTH: "Weed"}],
[{ORTH: "Cool"}],
[{ORTH: "Btw"}],
[{ORTH: "Bah"}],
[{ORTH: "Bullshit"}],
[{ORTH: "Lol"}],
[{ORTH: "Yo"}, {LOWER: "dawg"}],
[{ORTH: "Yay"}],
[{ORTH: "Ahh"}],
[{ORTH: "Yea"}],
[{ORTH: "Bah"}]
]
__all__ = ["ENTITY_RULES", "FALSE_POSITIVES"]

View File

@ -0,0 +1,133 @@
# encoding: utf8
from __future__ import unicode_literals
TOKENIZER_PREFIXES = r'''
,
"
(
[
{
*
<
>
$
£
¡
¿
'
``
`
#
....
...
»
§
US$
C$
A$
a-
'''.strip().split('\n')
TOKENIZER_SUFFIXES = r'''
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
«
_
''
's
'S
s
S
°
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
\-\-
´
(?<=[0-9])km²
(?<=[0-9])m²
(?<=[0-9])cm²
(?<=[0-9])mm²
(?<=[0-9])km³
(?<=[0-9])m³
(?<=[0-9])cm³
(?<=[0-9])mm³
(?<=[0-9])ha
(?<=[0-9])km
(?<=[0-9])m
(?<=[0-9])cm
(?<=[0-9])mm
(?<=[0-9])µm
(?<=[0-9])nm
(?<=[0-9])yd
(?<=[0-9])in
(?<=[0-9])ft
(?<=[0-9])kg
(?<=[0-9])g
(?<=[0-9])mg
(?<=[0-9])µg
(?<=[0-9])t
(?<=[0-9])lb
(?<=[0-9])oz
(?<=[0-9])m/s
(?<=[0-9])km/h
(?<=[0-9])mph
(?<=[0-9])°C
(?<=[0-9])°K
(?<=[0-9])°F
(?<=[0-9])hPa
(?<=[0-9])Pa
(?<=[0-9])mbar
(?<=[0-9])mb
(?<=[0-9])T
(?<=[0-9])G
(?<=[0-9])M
(?<=[0-9])K
(?<=[0-9])kb
'''.strip().split('\n')
TOKENIZER_INFIXES = r'''
\.\.\.+
(?<=[a-z])\.(?=[A-Z])
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-z])
(?<=[a-zA-Z])--(?=[a-zA-z])
(?<=[0-9])-(?=[0-9])
(?<=[A-Za-z]),(?=[A-Za-z])
(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ])
'''.strip().split('\n')
__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]

View File

@ -0,0 +1,24 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
TAG_MAP = {
"ADV": {POS: ADV},
"NOUN": {POS: NOUN},
"ADP": {POS: ADP},
"PRON": {POS: PRON},
"SCONJ": {POS: SCONJ},
"PROPN": {POS: PROPN},
"DET": {POS: DET},
"SYM": {POS: SYM},
"INTJ": {POS: INTJ},
"PUNCT": {POS: PUNCT},
"NUM": {POS: NUM},
"AUX": {POS: AUX},
"X": {POS: X},
"CONJ": {POS: CONJ},
"ADJ": {POS: ADJ},
"VERB": {POS: VERB}
}

View File

@ -0,0 +1,37 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
PRON_LEMMA = "-PRON-"
ENT_ID = "ent_id"
def update_exc(exc, additions):
overlap = set(exc.keys()).intersection(set(additions))
assert not overlap, overlap
exc.update(additions)
def strings_to_exc(orths):
return {orth: [{ORTH: orth}] for orth in orths}
def expand_exc(excs, search, replace):
updates = {}
for token_string, tokens in excs.items():
if search in token_string:
new_key = token_string.replace(search, replace)
new_value = [_fix_token(t, search, replace) for t in tokens]
updates[new_key] = new_value
return updates
def _fix_token(token, search, replace):
fixed = dict(token)
fixed[ORTH] = fixed[ORTH].replace(search, replace)
return fixed
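update_exc merges exception tables and asserts that no key is defined twice, while expand_exc clones existing exceptions with a character substitution, for example a typographic apostrophe. A short usage sketch; the import paths assume the package layout introduced in this PR:

from spacy.language_data import update_exc, expand_exc, strings_to_exc
from spacy.symbols import ORTH

exc = strings_to_exc(["a.m.", "p.m."])         # {"a.m.": [{ORTH: "a.m."}], ...}
update_exc(exc, {"vs.": [{ORTH: "vs."}]})      # fine: no overlapping keys
# update_exc(exc, {"a.m.": [{ORTH: "a.m."}]})  # would trip the overlap assert

# Re-create a straight-apostrophe exception with a curly apostrophe.
exc.update(expand_exc({"don't": [{ORTH: "do"}, {ORTH: "n't"}]}, "'", "’"))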

View File

@ -4,12 +4,12 @@ import pathlib
import ujson as json
-from .symbols import NOUN, VERB, ADJ, PUNCT
+from .symbols import POS, NOUN, VERB, ADJ, PUNCT
class Lemmatizer(object):
    @classmethod
-    def load(cls, path):
+    def load(cls, path, rules=None):
        index = {}
        exc = {}
        for pos in ['adj', 'noun', 'verb']:
@ -25,8 +25,11 @@ class Lemmatizer(object):
                exc[pos] = read_exc(file_)
            else:
                exc[pos] = {}
+        if rules is None and (path / 'vocab' / 'lemma_rules.json').exists():
            with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_:
                rules = json.load(file_)
+        elif rules is None:
+            rules = {}
        return cls(index, exc, rules)
    def __init__(self, index, exceptions, rules):
@ -34,7 +37,7 @@ class Lemmatizer(object):
        self.exc = exceptions
        self.rules = rules
-    def __call__(self, string, univ_pos, **morphology):
+    def __call__(self, string, univ_pos, morphology=None):
        if univ_pos == NOUN:
            univ_pos = 'noun'
        elif univ_pos == VERB:
@ -44,17 +47,18 @@ class Lemmatizer(object):
        elif univ_pos == PUNCT:
            univ_pos = 'punct'
        # See Issue #435 for example of where this logic is requied.
-        if self.is_base_form(univ_pos, **morphology):
+        if self.is_base_form(univ_pos, morphology):
            return set([string.lower()])
        lemmas = lemmatize(string, self.index.get(univ_pos, {}),
                           self.exc.get(univ_pos, {}),
                           self.rules.get(univ_pos, []))
        return lemmas
-    def is_base_form(self, univ_pos, **morphology):
+    def is_base_form(self, univ_pos, morphology=None):
        '''Check whether we're dealing with an uninflected paradigm, so we can
        avoid lemmatization entirely.'''
-        others = [key for key in morphology if key not in ('number', 'pos', 'verbform')]
+        morphology = {} if morphology is None else morphology
+        others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
        if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
            return True
        elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
@ -62,17 +66,17 @@ class Lemmatizer(object):
        else:
            return False
-    def noun(self, string, **morphology):
-        return self(string, 'noun', **morphology)
+    def noun(self, string, morphology=None):
+        return self(string, 'noun', morphology)
-    def verb(self, string, **morphology):
-        return self(string, 'verb', **morphology)
+    def verb(self, string, morphology=None):
+        return self(string, 'verb', morphology)
-    def adj(self, string, **morphology):
-        return self(string, 'adj', **morphology)
+    def adj(self, string, morphology=None):
+        return self(string, 'adj', morphology)
-    def punct(self, string, **morphology):
-        return self(string, 'punct', **morphology)
+    def punct(self, string, morphology=None):
+        return self(string, 'punct', morphology)
def lemmatize(string, index, exceptions, rules):

Some files were not shown because too many files have changed in this diff.