* Merge in changes from de branch

Matthew Honnibal 2015-09-06 19:49:28 +02:00
commit 86c888667f
50 changed files with 2351 additions and 973 deletions


@ -20,6 +20,7 @@ from __future__ import unicode_literals
from ast import literal_eval
import math
import gzip
import json
import plac
from pathlib import Path
@ -29,8 +30,6 @@ from shutil import copytree
import codecs
from collections import defaultdict
from spacy.en import get_lex_props
from spacy.en.lemmatizer import Lemmatizer
from spacy.vocab import Vocab
from spacy.vocab import write_binary_vectors
from spacy.strings import hash_string
@ -38,6 +37,13 @@ from preshed.counter import PreshCounter
from spacy.parts_of_speech import NOUN, VERB, ADJ
import spacy.en
import spacy.de
import spacy.fi
import spacy.it
def setup_tokenizer(lang_data_dir, tok_dir):
if not tok_dir.exists():
@ -139,7 +145,7 @@ def _read_senses(loc):
return lexicon
def setup_vocab(src_dir, dst_dir):
def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
if not dst_dir.exists():
dst_dir.mkdir()
@ -148,13 +154,13 @@ def setup_vocab(src_dir, dst_dir):
write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
else:
print("Warning: Word vectors file not found")
vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
vocab = Vocab(get_lex_attr=get_lex_attr, tag_map=tag_map)
clusters = _read_clusters(src_dir / 'clusters.txt')
probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
if not probs:
probs, oov_prob = _read_freqs(src_dir / 'freqs.txt')
if not probs:
oov_prob = 0.0
oov_prob = -20
else:
oov_prob = min(probs.values())
for word in clusters:
@ -163,23 +169,32 @@ def setup_vocab(src_dir, dst_dir):
lexicon = []
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
entry = get_lex_props(word)
entry['prob'] = float(prob)
cluster = clusters.get(word, '0')
lexeme = vocab[word]
lexeme.prob = prob
lexeme.is_oov = False
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
entry['cluster'] = int(cluster[::-1], 2)
vocab[word] = entry
if word in clusters:
lexeme.cluster = int(clusters[word][::-1], 2)
else:
lexeme.cluster = 0
vocab.dump(str(dst_dir / 'lexemes.bin'))
vocab.strings.dump(str(dst_dir / 'strings.txt'))
with (dst_dir / 'oov_prob').open('w') as file_:
file_.write('%f' % oov_prob)
def main(lang_data_dir, corpora_dir, model_dir):
def main(lang_id, lang_data_dir, corpora_dir, model_dir):
languages = {
'en': spacy.en.English.default_lex_attrs(),
'de': spacy.de.Deutsch.default_lex_attrs(),
'fi': spacy.fi.Finnish.default_lex_attrs(),
'it': spacy.it.Italian.default_lex_attrs(),
}
model_dir = Path(model_dir)
lang_data_dir = Path(lang_data_dir)
corpora_dir = Path(corpora_dir)
lang_data_dir = Path(lang_data_dir) / lang_id
corpora_dir = Path(corpora_dir) / lang_id
assert corpora_dir.exists()
assert lang_data_dir.exists()
@ -187,13 +202,19 @@ def main(lang_data_dir, corpora_dir, model_dir):
if not model_dir.exists():
model_dir.mkdir()
tag_map = json.load((lang_data_dir / 'tag_map.json').open())
setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
setup_vocab(corpora_dir, model_dir / 'vocab')
setup_vocab(languages[lang_id], tag_map, corpora_dir, model_dir / 'vocab')
if (lang_data_dir / 'gazetteer.json').exists():
copyfile(str(lang_data_dir / 'gazetteer.json'),
str(model_dir / 'vocab' / 'gazetteer.json'))
if not (model_dir / 'wordnet').exists():
if (lang_data_dir / 'lemma_rules.json').exists():
copyfile(str(lang_data_dir / 'lemma_rules.json'),
str(model_dir / 'vocab' / 'lemma_rules.json'))
if not (model_dir / 'wordnet').exists() and (corpora_dir / 'wordnet').exists():
copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet'))
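Note on the cluster handling above: the Brown cluster bitstring is reversed before being parsed as an integer so that its first characters land in the low bits, which is what lets feature code take cluster & 15 to read a four-bit prefix (see the comment referencing _parse_features.pyx). A minimal sketch, not part of the commit, with a made-up cluster value:

cluster_path = "1101000011"            # hypothetical Brown cluster bitstring
as_int = int(cluster_path[::-1], 2)    # little-endian: the first character becomes bit 0
prefix = as_int & 15                   # low four bits == first four characters of the path
assert format(prefix, '04b')[::-1] == cluster_path[:4]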


@ -14,7 +14,6 @@ import re
import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
from spacy.syntax.util import Config
from spacy.gold import read_json_file
@ -22,6 +21,11 @@ from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.ner import BiluoPushDown
from spacy.tagger import Tagger
from spacy.syntax.parser import Parser
def _corrupt(c, noise_level):
if random.random() >= noise_level:
@ -80,32 +84,28 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
beam_width=1, verbose=False,
use_orig_arc_eager=False):
dep_model_dir = path.join(model_dir, 'deps')
pos_model_dir = path.join(model_dir, 'pos')
ner_model_dir = path.join(model_dir, 'ner')
if path.exists(dep_model_dir):
shutil.rmtree(dep_model_dir)
if path.exists(pos_model_dir):
shutil.rmtree(pos_model_dir)
if path.exists(ner_model_dir):
shutil.rmtree(ner_model_dir)
os.mkdir(dep_model_dir)
os.mkdir(pos_model_dir)
os.mkdir(ner_model_dir)
setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
labels=ArcEager.get_labels(gold_tuples),
beam_width=beam_width)
Config.write(ner_model_dir, 'config', features='ner', seed=seed,
labels=Language.EntityTransitionSystem.get_labels(gold_tuples),
labels=BiluoPushDown.get_labels(gold_tuples),
beam_width=0)
if n_sents > 0:
gold_tuples = gold_tuples[:n_sents]
nlp = Language(data_dir=model_dir)
nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
for itn in range(n_iter):
scorer = Scorer()
@ -140,7 +140,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
scorer.tags_acc,
scorer.token_acc))
nlp.end_training()
nlp.end_training(model_dir)
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
beam_width=None):

bin/tagger/train.py Executable file

@ -0,0 +1,175 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function
import os
from os import path
import shutil
import codecs
import random
import plac
import re
import spacy.util
from spacy.en import English
from spacy.tagger import Tagger
from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.scorer import Scorer
def score_model(scorer, nlp, raw_text, annot_tuples):
if raw_text is None:
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
else:
tokens = nlp.tokenizer(raw_text)
nlp.tagger(tokens)
gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold)
def _merge_sents(sents):
m_deps = [[], [], [], [], [], []]
m_brackets = []
i = 0
for (ids, words, tags, heads, labels, ner), brackets in sents:
m_deps[0].extend(id_ + i for id_ in ids)
m_deps[1].extend(words)
m_deps[2].extend(tags)
m_deps[3].extend(head + i for head in heads)
m_deps[4].extend(labels)
m_deps[5].extend(ner)
m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
i += len(ids)
return [(m_deps, m_brackets)]
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
beam_width=1, verbose=False,
use_orig_arc_eager=False):
if n_sents > 0:
gold_tuples = gold_tuples[:n_sents]
templates = Tagger.default_templates()
nlp = Language(data_dir=model_dir, tagger=False)
nlp.tagger = Tagger.blank(nlp.vocab, templates)
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
for itn in range(n_iter):
scorer = Scorer()
loss = 0
for raw_text, sents in gold_tuples:
if gold_preproc:
raw_text = None
else:
sents = _merge_sents(sents)
for annot_tuples, ctnt in sents:
words = annot_tuples[1]
gold_tags = annot_tuples[2]
score_model(scorer, nlp, raw_text, annot_tuples)
if raw_text is None:
tokens = nlp.tokenizer.tokens_from_list(words)
else:
tokens = nlp.tokenizer(raw_text)
loss += nlp.tagger.train(tokens, gold_tags)
random.shuffle(gold_tuples)
print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
scorer.tags_acc,
scorer.token_acc))
nlp.end_training(model_dir)
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
beam_width=None):
nlp = Language(data_dir=model_dir)
if beam_width is not None:
nlp.parser.cfg.beam_width = beam_width
scorer = Scorer()
for raw_text, sents in gold_tuples:
if gold_preproc:
raw_text = None
else:
sents = _merge_sents(sents)
for annot_tuples, brackets in sents:
if raw_text is None:
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
nlp.tagger(tokens)
nlp.entity(tokens)
nlp.parser(tokens)
else:
tokens = nlp(raw_text, merge_mwes=False)
gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold, verbose=verbose)
return scorer
def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
nlp = Language(data_dir=model_dir)
if beam_width is not None:
nlp.parser.cfg.beam_width = beam_width
gold_tuples = read_json_file(dev_loc)
scorer = Scorer()
out_file = codecs.open(out_loc, 'w', 'utf8')
for raw_text, sents in gold_tuples:
sents = _merge_sents(sents)
for annot_tuples, brackets in sents:
if raw_text is None:
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
nlp.tagger(tokens)
nlp.entity(tokens)
nlp.parser(tokens)
else:
tokens = nlp(raw_text, merge_mwes=False)
gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold, verbose=False)
for t in tokens:
out_file.write(
'%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
)
return scorer
@plac.annotations(
train_loc=("Location of training file or directory"),
dev_loc=("Location of development file or directory"),
model_dir=("Location of output model directory",),
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
corruption_level=("Amount of noise to add to training data", "option", "c", float),
gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
out_loc=("Out location", "option", "o", str),
n_sents=("Number of training sentences", "option", "n", int),
n_iter=("Number of training iterations", "option", "i", int),
verbose=("Verbose error reporting", "flag", "v", bool),
debug=("Debug mode", "flag", "d", bool),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):
if not eval_only:
gold_train = list(read_json_file(train_loc))
train(English, gold_train, model_dir,
feat_set='basic' if not debug else 'debug',
gold_preproc=gold_preproc, n_sents=n_sents,
corruption_level=corruption_level, n_iter=n_iter,
verbose=verbose)
#if out_loc:
# write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
scorer = evaluate(English, list(read_json_file(dev_loc)),
model_dir, gold_preproc=gold_preproc, verbose=verbose)
print('TOK', scorer.token_acc)
print('POS', scorer.tags_acc)
print('UAS', scorer.uas)
print('LAS', scorer.las)
print('NER P', scorer.ents_p)
print('NER R', scorer.ents_r)
print('NER F', scorer.ents_f)
if __name__ == '__main__':
plac.call(main)
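The _merge_sents helper above concatenates per-sentence annotations into one document-level tuple, shifting token ids and head indices by the running offset i so that heads still point at the right tokens. A minimal sketch of the offset arithmetic, assuming _merge_sents from this script is in scope and using made-up annotation tuples with empty bracket lists:

sent1 = (([0, 1], ["Das", "stimmt"], ["PDS", "VVFIN"], [1, 1], ["sb", "ROOT"], ["O", "O"]), [])
sent2 = (([0, 1], ["Wirklich", "?"], ["ADV", "$."], [0, 0], ["ROOT", "punct"], ["O", "O"]), [])
merged, brackets = _merge_sents([sent1, sent2])[0]
ids, words, tags, heads, labels, ner = merged
assert ids == [0, 1, 2, 3]      # second sentence's ids shifted by len(first sentence) == 2
assert heads == [1, 1, 2, 2]    # head indices shifted by the same offset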

lang_data/de/infix.txt Normal file

@ -0,0 +1,3 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-z])
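A quick illustration of what the second infix rule above does: it marks a split point between a lowercase letter and a following capital, so a period with a missing space still separates the two tokens. This is only a sketch of the regex hit, not the full tokenizer machinery:

import re
infix_re = re.compile(r"(?<=[a-z])\.(?=[A-Z])")
match = infix_re.search("wurde.Das")    # period glued between two sentences
assert match is not None and match.start() == 5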


lang_data/de/morphs.json Normal file (empty)

lang_data/de/prefix.txt Normal file

@ -0,0 +1,21 @@
,
"
(
[
{
*
<
$
£
'
``
`
#
US$
C$
A$
a-
....
...

lang_data/de/sample.txt Normal file

@ -0,0 +1,3 @@
Biografie: Ein Spiel ist ein Theaterstück des Schweizer Schriftstellers Max Frisch, das 1967 entstand und am 1. Februar 1968 im Schauspielhaus Zürich uraufgeführt wurde. 1984 legte Frisch eine überarbeitete Neufassung vor. Das von Frisch als Komödie bezeichnete Stück greift eines seiner zentralen Themen auf: die Möglichkeit oder Unmöglichkeit des Menschen, seine Identität zu verändern.
Mit Biografie: Ein Spiel wandte sich Frisch von der Parabelform seiner Erfolgsstücke Biedermann und die Brandstifter und Andorra ab und postulierte eine „Dramaturgie der Permutation“. Darin sollte nicht, wie im klassischen Theater, Sinn und Schicksal im Mittelpunkt stehen, sondern die Zufälligkeit von Ereignissen und die Möglichkeit ihrer Variation. Dennoch handelt Biografie: Ein Spiel gerade von der Unmöglichkeit seines Protagonisten, seinen Lebenslauf grundlegend zu verändern. Frisch empfand die Wirkung des Stücks im Nachhinein als zu fatalistisch und die Umsetzung seiner theoretischen Absichten als nicht geglückt. Obwohl das Stück 1968 als unpolitisch und nicht zeitgemäß kritisiert wurde und auch später eine geteilte Rezeption erfuhr, gehört es an deutschsprachigen Bühnen zu den häufiger aufgeführten Stücken Frischs.

lang_data/de/specials.json Normal file

@ -0,0 +1,149 @@
{
"a.m.": [{"F": "a.m."}],
"p.m.": [{"F": "p.m."}],
"1a.m.": [{"F": "1"}, {"F": "a.m."}],
"2a.m.": [{"F": "2"}, {"F": "a.m."}],
"3a.m.": [{"F": "3"}, {"F": "a.m."}],
"4a.m.": [{"F": "4"}, {"F": "a.m."}],
"5a.m.": [{"F": "5"}, {"F": "a.m."}],
"6a.m.": [{"F": "6"}, {"F": "a.m."}],
"7a.m.": [{"F": "7"}, {"F": "a.m."}],
"8a.m.": [{"F": "8"}, {"F": "a.m."}],
"9a.m.": [{"F": "9"}, {"F": "a.m."}],
"10a.m.": [{"F": "10"}, {"F": "a.m."}],
"11a.m.": [{"F": "11"}, {"F": "a.m."}],
"12a.m.": [{"F": "12"}, {"F": "a.m."}],
"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],
"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],
"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],
"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],
"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],
"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],
"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],
"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],
"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],
"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],
"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],
"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],
"1p.m.": [{"F": "1"}, {"F": "p.m."}],
"2p.m.": [{"F": "2"}, {"F": "p.m."}],
"3p.m.": [{"F": "3"}, {"F": "p.m."}],
"4p.m.": [{"F": "4"}, {"F": "p.m."}],
"5p.m.": [{"F": "5"}, {"F": "p.m."}],
"6p.m.": [{"F": "6"}, {"F": "p.m."}],
"7p.m.": [{"F": "7"}, {"F": "p.m."}],
"8p.m.": [{"F": "8"}, {"F": "p.m."}],
"9p.m.": [{"F": "9"}, {"F": "p.m."}],
"10p.m.": [{"F": "10"}, {"F": "p.m."}],
"11p.m.": [{"F": "11"}, {"F": "p.m."}],
"12p.m.": [{"F": "12"}, {"F": "p.m."}],
"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],
"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],
"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],
"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],
"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],
"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],
"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],
"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],
"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],
"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],
"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],
"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],
"Jan.": [{"F": "Jan.", "L": "Januar"}],
"Feb.": [{"F": "Feb.", "L": "Februar"}],
"Mär.": [{"F": "Mär.", "L": "März"}],
"Apr.": [{"F": "Apr.", "L": "April"}],
"Mai.": [{"F": "Mai.", "L": "Mai"}],
"Jun.": [{"F": "Jun.", "L": "Juni"}],
"Jul.": [{"F": "Jul.", "L": "Juli"}],
"Aug.": [{"F": "Aug.", "L": "August"}],
"Sep.": [{"F": "Sep.", "L": "September"}],
"Sept.": [{"F": "Sept.", "L": "September"}],
"Okt.": [{"F": "Okt.", "L": "Oktober"}],
"Nov.": [{"F": "Nov.", "L": "November"}],
"Dez.": [{"F": "Dez.", "L": "Dezember"}],
":)": [{"F": ":)"}],
"<3": [{"F": "<3"}],
";)": [{"F": ";)"}],
"(:": [{"F": "(:"}],
":(": [{"F": ":("}],
"-_-": [{"F": "-_-"}],
"=)": [{"F": "=)"}],
":/": [{"F": ":/"}],
":>": [{"F": ":>"}],
";-)": [{"F": ";-)"}],
":Y": [{"F": ":Y"}],
":P": [{"F": ":P"}],
":-P": [{"F": ":-P"}],
":3": [{"F": ":3"}],
"=3": [{"F": "=3"}],
"xD": [{"F": "xD"}],
"^_^": [{"F": "^_^"}],
"=]": [{"F": "=]"}],
"=D": [{"F": "=D"}],
"<333": [{"F": "<333"}],
":))": [{"F": ":))"}],
":0": [{"F": ":0"}],
"-__-": [{"F": "-__-"}],
"xDD": [{"F": "xDD"}],
"o_o": [{"F": "o_o"}],
"o_O": [{"F": "o_O"}],
"V_V": [{"F": "V_V"}],
"=[[": [{"F": "=[["}],
"<33": [{"F": "<33"}],
";p": [{"F": ";p"}],
";D": [{"F": ";D"}],
";-p": [{"F": ";-p"}],
";(": [{"F": ";("}],
":p": [{"F": ":p"}],
":]": [{"F": ":]"}],
":O": [{"F": ":O"}],
":-/": [{"F": ":-/"}],
":-)": [{"F": ":-)"}],
":(((": [{"F": ":((("}],
":((": [{"F": ":(("}],
":')": [{"F": ":')"}],
"(^_^)": [{"F": "(^_^)"}],
"(=": [{"F": "(="}],
"o.O": [{"F": "o.O"}],
"\")": [{"F": "\")"}],
"a.": [{"F": "a."}],
"b.": [{"F": "b."}],
"c.": [{"F": "c."}],
"d.": [{"F": "d."}],
"e.": [{"F": "e."}],
"f.": [{"F": "f."}],
"g.": [{"F": "g."}],
"h.": [{"F": "h."}],
"i.": [{"F": "i."}],
"j.": [{"F": "j."}],
"k.": [{"F": "k."}],
"l.": [{"F": "l."}],
"m.": [{"F": "m."}],
"n.": [{"F": "n."}],
"o.": [{"F": "o."}],
"p.": [{"F": "p."}],
"q.": [{"F": "q."}],
"s.": [{"F": "s."}],
"t.": [{"F": "t."}],
"u.": [{"F": "u."}],
"v.": [{"F": "v."}],
"w.": [{"F": "w."}],
"x.": [{"F": "x."}],
"y.": [{"F": "y."}],
"z.": [{"F": "z."}],
"z.b.": [{"F": "z.b."}],
"e.h.": [{"F": "I.e."}],
"o.ä.": [{"F": "I.E."}],
"bzw.": [{"F": "bzw."}],
"usw.": [{"F": "usw."}],
"\n": [{"F": "\n", "pos": "SP"}],
"\t": [{"F": "\t", "pos": "SP"}],
" ": [{"F": " ", "pos": "SP"}]
}
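In these tokenizer special cases the key is a surface string and the value is the list of tokens it splits into; "F" appears to be the token's form and "L" an optional lemma override (so "10am" becomes two tokens, with "am" lemmatized to "a.m."). A small sketch under that reading:

import json
specials = json.loads('{"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}]}')
tokens = specials["10am"]
forms = [t["F"] for t in tokens]
lemmas = [t.get("L", t["F"]) for t in tokens]   # fall back to the form when no lemma is given
assert forms == ["10", "am"] and lemmas == ["10", "a.m."]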

lang_data/de/suffix.txt Normal file

@ -0,0 +1,26 @@
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
''
's
'S
s
S
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[0-9])km
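The last suffix rules peel trailing punctuation or a unit off a token; for example the (?<=[0-9])km pattern splits a number-plus-unit string. A minimal sketch, not part of the commit:

import re
suffix_re = re.compile(r"(?<=[0-9])km")
token = "80km"
m = suffix_re.search(token)
number, unit = token[:m.start()], token[m.start():]
assert (number, unit) == ("80", "km")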

lang_data/de/tag_map.json Normal file

@ -0,0 +1,56 @@
{
"$(": {"pos": "PUNCT", "PunctType": "Brck"},
"$,": {"pos": "PUNCT", "PunctType": "Comm"},
"$.": {"pos": "PUNCT", "PunctType": "Peri"},
"ADJA": {"pos": "ADJ"},
"ADJD": {"pos": "ADJ", "Variant": "Short"},
"ADV": {"pos": "ADV"},
"APPO": {"pos": "ADP", "AdpType": "Post"},
"APPR": {"pos": "ADP", "AdpType": "Prep"},
"APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"},
"APZR": {"pos": "ADP", "AdpType": "Circ"},
"ART": {"pos": "DET", "PronType": "Art"},
"CARD": {"pos": "NUM", "NumType": "Card"},
"FM": {"pos": "X", "Foreign": "Yes"},
"ITJ": {"pos": "INTJ"},
"KOKOM": {"pos": "CONJ", "ConjType": "Comp"},
"KON": {"pos": "CONJ"},
"KOUI": {"pos": "SCONJ"},
"KOUS": {"pos": "SCONJ"},
"NE": {"pos": "PROPN"},
"NN": {"pos": "NOUN"},
"PAV": {"pos": "ADV", "PronType": "Dem"},
"PDAT": {"pos": "DET", "PronType": "Dem"},
"PDS": {"pos": "PRON", "PronType": "Dem"},
"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"},
"PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"},
"PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"},
"PPER": {"pos": "PRON", "PronType": "Prs"},
"PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"},
"PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"},
"PRELAT": {"pos": "DET", "PronType": "Rel"},
"PRELS": {"pos": "PRON", "PronType": "Rel"},
"PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"},
"PTKA": {"pos": "PART"},
"PTKANT": {"pos": "PART", "PartType": "Res"},
"PTKNEG": {"pos": "PART", "Negative": "Neg"},
"PTKVZ": {"pos": "PART", "PartType": "Vbp"},
"PTKZU": {"pos": "PART", "PartType": "Inf"},
"PWAT": {"pos": "DET", "PronType": "Int"},
"PWAV": {"pos": "ADV", "PronType": "Int"},
"PWS": {"pos": "PRON", "PronType": "Int"},
"TRUNC": {"pos": "X", "Hyph": "Yes"},
"VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"},
"VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"},
"VAINF": {"pos": "AUX", "VerbForm": "Inf"},
"VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"},
"VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"},
"VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"},
"VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"},
"VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"},
"VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"},
"VVINF": {"pos": "VERB", "VerbForm": "Inf"},
"VVIZU": {"pos": "VERB", "VerbForm": "Inf"},
"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},
"XY": {"pos": "X"}
}
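Each entry in this tag map translates a fine-grained STTS tag into a coarse part of speech plus morphological features; this is what the new tag_map argument threaded through setup_vocab and Vocab consumes. A small sketch of a lookup, assuming it is run from the repository root so the path added in this commit resolves:

import json
tag_map = json.load(open('lang_data/de/tag_map.json'))
entry = tag_map["VVFIN"]                 # finite full verb
assert entry["pos"] == "VERB"
features = {k: v for k, v in entry.items() if k != "pos"}
assert features == {"Mood": "Ind", "VerbForm": "Fin"}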


@ -0,0 +1,31 @@
{
"noun": [
["s", ""],
["ses", "s"],
["ves", "f"],
["xes", "x"],
["zes", "z"],
["ches", "ch"],
["shes", "sh"],
["men", "man"],
["ies", "y"]
],
"verb": [
["s", ""],
["ies", "y"],
["es", "e"],
["es", ""],
["ed", "e"],
["ed", ""],
["ing", "e"],
["ing", ""]
],
"adj": [
["er", ""],
["est", ""],
["er", "e"],
["est", "e"]
]
}
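These rules are (suffix-to-strip, suffix-to-add) pairs per part of speech, consumed by the new spacy/lemmatizer.py further down: a candidate lemma is produced by swapping the suffix and kept only if it appears in the WordNet index. Applying one noun rule by hand:

rule_old, rule_new = "ies", "y"          # the ["ies", "y"] noun rule above
word = "ponies"
candidate = word[:len(word) - len(rule_old)] + rule_new
assert candidate == "pony"               # accepted as a lemma only if "pony" is in the index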

lang_data/en/tag_map.json Normal file

@ -0,0 +1,60 @@
{
".": {"pos": "punct", "puncttype": "peri"},
",": {"pos": "punct", "puncttype": "comm"},
"-LRB-": {"pos": "punct", "puncttype": "brck", "punctside": "ini"},
"-RRB-": {"pos": "punct", "puncttype": "brck", "punctside": "fin"},
"``": {"pos": "punct", "puncttype": "quot", "punctside": "ini"},
"\"\"": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
"''": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
":": {"pos": "punct"},
"$": {"pos": "sym", "other": {"symtype": "currency"}},
"#": {"pos": "sym", "other": {"symtype": "numbersign"}},
"AFX": {"pos": "adj", "hyph": "hyph"},
"CC": {"pos": "conj", "conjtype": "coor"},
"CD": {"pos": "num", "numtype": "card"},
"DT": {"pos": "adj", "prontype": "prn"},
"EX": {"pos": "adv", "advtype": "ex"},
"FW": {"pos": "x", "foreign": "foreign"},
"HYPH": {"pos": "punct", "puncttype": "dash"},
"IN": {"pos": "adp"},
"JJ": {"pos": "adj", "degree": "pos"},
"JJR": {"pos": "adj", "degree": "comp"},
"JJS": {"pos": "adj", "degree": "sup"},
"LS": {"pos": "punct", "numtype": "ord"},
"MD": {"pos": "verb", "verbtype": "mod"},
"NIL": {"pos": "no_tag"},
"NN": {"pos": "noun", "number": "sing"},
"NNP": {"pos": "noun", "nountype": "prop", "number": "sing"},
"NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"},
"NNS": {"pos": "noun", "number": "plur"},
"PDT": {"pos": "adj", "adjtype": "pdt", "prontype": "prn"},
"POS": {"pos": "part", "poss": "poss"},
"PRP": {"pos": "noun", "prontype": "prs"},
"PRP$": {"pos": "adj", "prontype": "prs", "poss": "poss"},
"RB": {"pos": "adv", "degree": "pos"},
"RBR": {"pos": "adv", "degree": "comp"},
"RBS": {"pos": "adv", "degree": "sup"},
"RP": {"pos": "part"},
"SYM": {"pos": "sym"},
"TO": {"pos": "part", "parttype": "inf", "verbform": "inf"},
"UH": {"pos": "intJ"},
"VB": {"pos": "verb", "verbform": "inf"},
"VBD": {"pos": "verb", "verbform": "fin", "tense": "past"},
"VBG": {"pos": "verb", "verbform": "part", "tense": "pres", "aspect": "prog"},
"VBN": {"pos": "verb", "verbform": "part", "tense": "past", "aspect": "perf"},
"VBP": {"pos": "verb", "verbform": "fin", "tense": "pres"},
"VBZ": {"pos": "verb", "verbform": "fin", "tense": "pres", "number": "sing", "person": 3},
"WDT": {"pos": "adj", "prontype": "int|rel"},
"WP": {"pos": "noun", "prontype": "int|rel"},
"WP$": {"pos": "adj", "poss": "poss", "prontype": "int|rel"},
"WRB": {"pos": "adv", "prontype": "int|rel"},
"SP": {"pos": "space"},
"ADD": {"pos": "x"},
"NFP": {"pos": "punct"},
"GW": {"pos": "x"},
"AFX": {"pos": "x"},
"HYPH": {"pos": "punct"},
"XX": {"pos": "x"},
"BES": {"pos": "verb"},
"HVS": {"pos": "verb"}
}


@ -153,7 +153,7 @@ def main(modules, is_pypy):
MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
'spacy.lexeme', 'spacy.vocab', 'spacy.attrs',
'spacy.morphology',
'spacy.morphology', 'spacy.tagger',
'spacy.syntax.stateclass',
'spacy._ml', 'spacy._theano',
'spacy.tokenizer', 'spacy.en.attrs',


@ -91,6 +91,8 @@ cdef class Model:
count_feats(counts[guess], feats, n_feats, -cost)
self._model.update(counts)
def end_training(self):
def end_training(self, model_loc=None):
if model_loc is None:
model_loc = self.model_loc
self._model.end_training()
self._model.dump(self.model_loc, freq_thresh=0)
self._model.dump(model_loc, freq_thresh=0)


@ -84,3 +84,4 @@ cpdef enum attr_id_t:
ENT_TYPE
HEAD
SPACY
PROB


@ -1,181 +1,12 @@
from __future__ import unicode_literals
from __future__ import unicode_literals, print_function
from os import path
import re
import struct
import json
from .. import orth
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..syntax.arc_eager import ArcEager
from ..syntax.ner import BiluoPushDown
from ..syntax.parser import ParserFactory
from ..serialize.bits import BitArray
from ..matcher import Matcher
from ..language import Language
from ..tokens import Doc
from ..multi_words import RegexMerger
from .pos import EnPosTagger
from .pos import POS_TAGS
from .attrs import get_flags
from . import regexes
from ..util import read_lang_data
from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
def get_lex_props(string, oov_prob=-30, is_oov=False):
return {
'flags': get_flags(string, is_oov=is_oov),
'length': len(string),
'orth': string,
'lower': string.lower(),
'norm': string,
'shape': orth.word_shape(string),
'prefix': string[0],
'suffix': string[-3:],
'cluster': 0,
'prob': oov_prob,
'sentiment': 0
}
if_model_present = -1
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
class English(object):
"""The English NLP pipeline.
Example:
Load data from default directory:
>>> nlp = English()
>>> nlp = English(data_dir=u'')
Load data from specified directory:
>>> nlp = English(data_dir=u'path/to/data_directory')
Disable (and avoid loading) parts of the processing pipeline:
>>> nlp = English(vectors=False, parser=False, tagger=False, entity=False)
Start with nothing loaded:
>>> nlp = English(data_dir=None)
"""
ParserTransitionSystem = ArcEager
EntityTransitionSystem = BiluoPushDown
def __init__(self,
data_dir=LOCAL_DATA_DIR,
Tokenizer=Tokenizer.from_dir,
Tagger=EnPosTagger,
Parser=ParserFactory(ParserTransitionSystem),
Entity=ParserFactory(EntityTransitionSystem),
Matcher=Matcher.from_dir,
Packer=None,
load_vectors=True
):
self.data_dir = data_dir
if path.exists(path.join(data_dir, 'vocab', 'oov_prob')):
oov_prob = float(open(path.join(data_dir, 'vocab', 'oov_prob')).read())
else:
oov_prob = None
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
get_lex_props=get_lex_props, load_vectors=load_vectors,
pos_tags=POS_TAGS,
oov_prob=oov_prob)
if Tagger is True:
Tagger = EnPosTagger
if Parser is True:
transition_system = self.ParserTransitionSystem
Parser = lambda s, d: parser.Parser(s, d, transition_system)
if Entity is True:
transition_system = self.EntityTransitionSystem
Entity = lambda s, d: parser.Parser(s, d, transition_system)
self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer'))
if Tagger and path.exists(path.join(data_dir, 'pos')):
self.tagger = Tagger(self.vocab.strings, data_dir)
else:
self.tagger = None
if Parser and path.exists(path.join(data_dir, 'deps')):
self.parser = Parser(self.vocab.strings, path.join(data_dir, 'deps'))
else:
self.parser = None
if Entity and path.exists(path.join(data_dir, 'ner')):
self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
else:
self.entity = None
if Matcher:
self.matcher = Matcher(self.vocab, data_dir)
else:
self.matcher = None
if Packer:
self.packer = Packer(self.vocab, data_dir)
else:
self.packer = None
self.mwe_merger = RegexMerger([
('IN', 'O', regexes.MW_PREPOSITIONS_RE),
('CD', 'TIME', regexes.TIME_RE),
('NNP', 'DATE', regexes.DAYS_RE),
('CD', 'MONEY', regexes.MONEY_RE)])
def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False):
"""Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbitrary whitespace. Alignment into the original string
is preserved.
Args:
text (unicode): The text to be processed.
Returns:
tokens (spacy.tokens.Doc):
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp('An example sentence. Another example sentence.')
>>> tokens[0].orth_, tokens[0].head.tag_
('An', 'NN')
"""
tokens = self.tokenizer(text)
if self.tagger and tag:
self.tagger(tokens)
if self.matcher and entity:
self.matcher(tokens)
if self.parser and parse:
self.parser(tokens)
if self.entity and entity:
self.entity(tokens)
if merge_mwes and self.mwe_merger is not None:
self.mwe_merger(tokens)
return tokens
def end_training(self, data_dir=None):
if data_dir is None:
data_dir = self.data_dir
self.parser.model.end_training()
self.entity.model.end_training()
self.tagger.model.end_training()
self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
file_.write(
json.dumps([
(TAG, list(self.tagger.freqs[TAG].items())),
(DEP, list(self.parser.moves.freqs[DEP].items())),
(ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())),
(ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())),
(HEAD, list(self.parser.moves.freqs[HEAD].items()))]))
@property
def tags(self):
"""Deprecated. List of part-of-speech tag names."""
return self.tagger.tag_names
class English(Language):
@classmethod
def default_data_dir(cls):
return LOCAL_DATA_DIR
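With this change a language is just a thin subclass of the shared Language pipeline, overriding defaults such as the data directory or the lexical attribute getters. A minimal sketch, not from the commit (Dutch is a hypothetical example), of what another language module along the lines of spacy.de or spacy.fi would look like:

from os import path
from spacy.language import Language

class Dutch(Language):                   # hypothetical language subclass
    @classmethod
    def default_data_dir(cls):
        return path.join(path.dirname(__file__), 'data')

# nlp = Dutch(data_dir='path/to/nl_model')   # same constructor as English or Deutsch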


@ -1,105 +0,0 @@
from __future__ import unicode_literals
from os import path
import codecs
NOUN_RULES = (
('s', ''),
('ses', 's'),
('ves', 'f'),
('xes', 'x'),
('zes', 'z'),
('ches', 'ch'),
('shes', 'sh'),
('men', 'man'),
('ies', 'y')
)
VERB_RULES = (
("s", ""),
("ies", "y"),
("es", "e"),
("es", ""),
("ed", "e"),
("ed", ""),
("ing", "e"),
("ing", "")
)
ADJ_RULES = (
("er", ""),
("est", ""),
("er", "e"),
("est", "e")
)
class Lemmatizer(object):
def __init__(self, wn_dict_dir, noun_id, verb_id, adj_id):
self.noun_id = noun_id
self.verb_id = verb_id
self.adj_id = adj_id
self.index = {}
self.exc = {}
for pos in ['adj', 'adv', 'noun', 'verb']:
self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos))
self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos))
def __call__(self, string, pos):
if pos == self.noun_id:
return self.noun(string)
elif pos == self.verb_id:
return self.verb(string)
elif pos == self.adj_id:
return self.adj(string)
else:
raise Exception("Cannot lemmatize with unknown pos: %s" % pos)
def noun(self, string):
return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES)
def verb(self, string):
return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES)
def adj(self, string):
return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES)
def lemmatize(string, index, exceptions, rules):
string = string.lower()
forms = []
if string in index:
forms.append(string)
forms.extend(exceptions.get(string, []))
for old, new in rules:
if string.endswith(old):
form = string[:len(string) - len(old)] + new
if form in index:
forms.append(form)
if not forms:
forms.append(string)
return set(forms)
def read_index(loc):
index = set()
for line in codecs.open(loc, 'r', 'utf8'):
if line.startswith(' '):
continue
pieces = line.split()
word = pieces[0]
if word.count('_') == 0:
index.add(word)
return index
def read_exc(loc):
exceptions = {}
for line in codecs.open(loc, 'r', 'utf8'):
if line.startswith(' '):
continue
pieces = line.split()
exceptions[pieces[0]] = tuple(pieces[1:])
return exceptions


@ -1,26 +1,5 @@
from preshed.maps cimport PreshMapArray
from preshed.counter cimport PreshCounter
from cymem.cymem cimport Pool
from .._ml cimport Model
from ..strings cimport StringStore
from ..structs cimport TokenC, LexemeC, Morphology, PosTag
from ..parts_of_speech cimport univ_pos_t
from .lemmatizer import Lemmatizer
from ..tagger cimport Tagger
cdef class EnPosTagger:
cdef readonly Pool mem
cdef readonly StringStore strings
cdef readonly Model model
cdef public object lemmatizer
cdef PreshMapArray _morph_cache
cdef public dict freqs
cdef PosTag* tags
cdef readonly object tag_names
cdef readonly object tag_map
cdef readonly int n_tags
cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1
cdef class EnPosTagger(Tagger):
pass


@ -1,389 +1,11 @@
from os import path
import json
import os
import shutil
from libc.string cimport memset
from ..parts_of_speech cimport NOUN, VERB, ADJ
from cymem.cymem cimport Address
from thinc.typedefs cimport atom_t, weight_t
from collections import defaultdict
from ..parts_of_speech cimport univ_pos_t
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
from ..structs cimport TokenC, Morphology, LexemeC
from ..tokens.doc cimport Doc
from ..morphology cimport set_morph_from_dict
from .._ml cimport arg_max
from .attrs cimport TAG, IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
from ..typedefs cimport attr_t
from .lemmatizer import Lemmatizer
from ..lemmatizer import Lemmatizer
cpdef enum en_person_t:
NO_PERSON
FIRST
SECOND
THIRD
NON_THIRD
cpdef enum en_number_t:
NO_NUMBER
SINGULAR
PLURAL
MASS
cpdef enum en_gender_t:
NO_GENDER
MASCULINE
FEMININE
NEUTER
cpdef enum en_case_t:
NO_CASE
NOMINATIVE
GENITIVE
ACCUSATIVE
REFLEXIVE
DEMONYM
cpdef enum en_tenspect_t:
NO_TENSE
BASE_VERB
PRESENT
PAST
PASSIVE
ING
MODAL
cpdef enum misc_t:
NO_MISC
COMPARATIVE
SUPERLATIVE
RELATIVE
NAME
cpdef enum:
P2_orth
P2_cluster
P2_shape
P2_prefix
P2_suffix
P2_pos
P2_lemma
P2_flags
P1_orth
P1_cluster
P1_shape
P1_prefix
P1_suffix
P1_pos
P1_lemma
P1_flags
W_orth
W_cluster
W_shape
W_prefix
W_suffix
W_pos
W_lemma
W_flags
N1_orth
N1_cluster
N1_shape
N1_prefix
N1_suffix
N1_pos
N1_lemma
N1_flags
N2_orth
N2_cluster
N2_shape
N2_prefix
N2_suffix
N2_pos
N2_lemma
N2_flags
N_CONTEXT_FIELDS
POS_TAGS = {
'NULL': (NO_TAG, {}),
'EOL': (EOL, {}),
'CC': (CONJ, {}),
'CD': (NUM, {}),
'DT': (DET, {}),
'EX': (DET, {}),
'FW': (X, {}),
'IN': (ADP, {}),
'JJ': (ADJ, {}),
'JJR': (ADJ, {'misc': COMPARATIVE}),
'JJS': (ADJ, {'misc': SUPERLATIVE}),
'LS': (X, {}),
'MD': (VERB, {'tenspect': MODAL}),
'NN': (NOUN, {}),
'NNS': (NOUN, {'number': PLURAL}),
'NNP': (NOUN, {'misc': NAME}),
'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
'PDT': (DET, {}),
'POS': (PRT, {'case': GENITIVE}),
'PRP': (PRON, {}),
'PRP$': (PRON, {'case': GENITIVE}),
'RB': (ADV, {}),
'RBR': (ADV, {'misc': COMPARATIVE}),
'RBS': (ADV, {'misc': SUPERLATIVE}),
'RP': (PRT, {}),
'SYM': (X, {}),
'TO': (PRT, {}),
'UH': (X, {}),
'VB': (VERB, {}),
'VBD': (VERB, {'tenspect': PAST}),
'VBG': (VERB, {'tenspect': ING}),
'VBN': (VERB, {'tenspect': PASSIVE}),
'VBP': (VERB, {'tenspect': PRESENT}),
'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
'WDT': (DET, {'misc': RELATIVE}),
'WP': (PRON, {'misc': RELATIVE}),
'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
'WRB': (ADV, {'misc': RELATIVE}),
'!': (PUNCT, {}),
'#': (PUNCT, {}),
'$': (PUNCT, {}),
"''": (PUNCT, {}),
"(": (PUNCT, {}),
")": (PUNCT, {}),
"-LRB-": (PUNCT, {}),
"-RRB-": (PUNCT, {}),
".": (PUNCT, {}),
",": (PUNCT, {}),
"``": (PUNCT, {}),
":": (PUNCT, {}),
"?": (PUNCT, {}),
"ADD": (X, {}),
"NFP": (PUNCT, {}),
"GW": (X, {}),
"AFX": (X, {}),
"HYPH": (PUNCT, {}),
"XX": (X, {}),
"BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
"HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
"SP": (SPACE, {})
}
POS_TEMPLATES = (
(W_orth,),
(P1_lemma, P1_pos),
(P2_lemma, P2_pos),
(N1_orth,),
(N2_orth,),
(W_suffix,),
(W_prefix,),
(P1_pos,),
(P2_pos,),
(P1_pos, P2_pos),
(P1_pos, W_orth),
(P1_suffix,),
(N1_suffix,),
(W_shape,),
(W_cluster,),
(N1_cluster,),
(N2_cluster,),
(P1_cluster,),
(P2_cluster,),
(W_flags,),
(N1_flags,),
(N2_flags,),
(P1_flags,),
(P2_flags,),
)
cdef struct _CachedMorph:
Morphology morph
int lemma
def setup_model_dir(tag_names, tag_map, templates, model_dir):
if path.exists(model_dir):
shutil.rmtree(model_dir)
os.mkdir(model_dir)
config = {
'templates': templates,
'tag_names': tag_names,
'tag_map': tag_map
}
with open(path.join(model_dir, 'config.json'), 'w') as file_:
json.dump(config, file_)
cdef class EnPosTagger:
cdef class EnPosTagger(Tagger):
"""A part-of-speech tagger for English"""
def __init__(self, StringStore strings, data_dir):
self.mem = Pool()
model_dir = path.join(data_dir, 'pos')
self.strings = strings
cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
self.tag_names = sorted(cfg['tag_names'])
assert self.tag_names
self.n_tags = len(self.tag_names)
self.tag_map = cfg['tag_map']
cdef int n_tags = len(self.tag_names) + 1
self.model = Model(n_tags, cfg['templates'], model_dir)
self._morph_cache = PreshMapArray(n_tags)
self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
for i, tag in enumerate(sorted(self.tag_names)):
pos, props = self.tag_map[tag]
self.tags[i].id = i
self.tags[i].pos = pos
set_morph_from_dict(&self.tags[i].morph, props)
if path.exists(path.join(data_dir, 'tokenizer', 'morphs.json')):
self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer',
'morphs.json'))))
self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
self.freqs = {TAG: defaultdict(int)}
for tag in self.tag_names:
self.freqs[TAG][self.strings[tag]] = 1
self.freqs[TAG][0] = 1
def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object.
Args:
tokens (Doc): The tokens to be tagged.
"""
if tokens.length == 0:
return 0
cdef int i
cdef atom_t[N_CONTEXT_FIELDS] context
cdef const weight_t* scores
for i in range(tokens.length):
if tokens.data[i].pos == 0:
fill_context(context, i, tokens.data)
scores = self.model.score(context)
guess = arg_max(scores, self.model.n_classes)
tokens.data[i].tag = self.strings[self.tag_names[guess]]
self.set_morph(i, &self.tags[guess], tokens.data)
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
def tag_from_strings(self, Doc tokens, object tag_strs):
cdef int i
for i in range(tokens.length):
tokens.data[i].tag = self.strings[tag_strs[i]]
self.set_morph(i, &self.tags[self.tag_names.index(tag_strs[i])],
tokens.data)
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
def train(self, Doc tokens, object gold_tag_strs):
cdef int i
cdef int loss
cdef atom_t[N_CONTEXT_FIELDS] context
cdef const weight_t* scores
golds = [self.tag_names.index(g) if g is not None else -1
for g in gold_tag_strs]
correct = 0
for i in range(tokens.length):
fill_context(context, i, tokens.data)
scores = self.model.score(context)
guess = arg_max(scores, self.model.n_classes)
loss = guess != golds[i] if golds[i] != -1 else 0
self.model.update(context, guess, golds[i], loss)
tokens.data[i].tag = self.strings[self.tag_names[guess]]
self.set_morph(i, &self.tags[guess], tokens.data)
correct += loss == 0
self.freqs[TAG][tokens.data[i].tag] += 1
return correct
cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1:
tokens[i].pos = tag.pos
cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
if cached is NULL:
cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
cached.morph = tag.morph
self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
tokens[i].lemma = cached.lemma
tokens[i].morph = cached.morph
cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
if self.lemmatizer is None:
return lex.orth
cdef unicode py_string = self.strings[lex.orth]
if pos != NOUN and pos != VERB and pos != ADJ:
return lex.orth
cdef set lemma_strings
cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string, pos)
lemma_string = sorted(lemma_strings)[0]
lemma = self.strings[lemma_string]
return lemma
def load_morph_exceptions(self, dict exc):
cdef unicode pos_str
cdef unicode form_str
cdef unicode lemma_str
cdef dict entries
cdef dict props
cdef int lemma
cdef attr_t orth
cdef int pos
for pos_str, entries in exc.items():
pos = self.tag_names.index(pos_str)
for form_str, props in entries.items():
lemma_str = props.get('L', form_str)
orth = self.strings[form_str]
cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
cached.lemma = self.strings[lemma_str]
set_morph_from_dict(&cached.morph, props)
self._morph_cache.set(pos, orth, <void*>cached)
cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
_fill_from_token(&context[P2_orth], &tokens[i-2])
_fill_from_token(&context[P1_orth], &tokens[i-1])
_fill_from_token(&context[W_orth], &tokens[i])
_fill_from_token(&context[N1_orth], &tokens[i+1])
_fill_from_token(&context[N2_orth], &tokens[i+2])
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
context[0] = t.lex.lower
context[1] = t.lex.cluster
context[2] = t.lex.shape
context[3] = t.lex.prefix
context[4] = t.lex.suffix
context[5] = t.tag
context[6] = t.lemma
if t.lex.flags & (1 << IS_ALPHA):
context[7] = 1
elif t.lex.flags & (1 << IS_PUNCT):
context[7] = 2
elif t.lex.flags & (1 << LIKE_URL):
context[7] = 3
elif t.lex.flags & (1 << LIKE_NUM):
context[7] = 4
else:
context[7] = 0
def make_lemmatizer(self, data_dir):
return Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)

spacy/fi/__init__.py Normal file

@ -0,0 +1,11 @@
from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
class Finnish(Language):
@classmethod
def default_data_dir(cls):
return path.join(path.dirname(__file__), 'data')

spacy/language.py Normal file

@ -0,0 +1,252 @@
from os import path
try:
import ujson as json
except ImportError:
import json
from .tokenizer import Tokenizer
from .vocab import Vocab
from .syntax.parser import Parser
from .tagger import Tagger
from .matcher import Matcher
from .serialize.packer import Packer
from ._ml import Model
from . import attrs
from . import orth
from .syntax.ner import BiluoPushDown
from .syntax.arc_eager import ArcEager
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
class Language(object):
@staticmethod
def lower(string):
return string.lower()
@staticmethod
def norm(string):
return string
@staticmethod
def shape(string):
return orth.word_shape(string)
@staticmethod
def prefix(string):
return string[0]
@staticmethod
def suffix(string):
return string[-3:]
@staticmethod
def prob(string):
return -30
@staticmethod
def cluster(string):
return 0
@staticmethod
def is_alpha(string):
return orth.is_alpha(string)
@staticmethod
def is_ascii(string):
return orth.is_ascii(string)
@staticmethod
def is_digit(string):
return string.isdigit()
@staticmethod
def is_lower(string):
return orth.is_lower(string)
@staticmethod
def is_punct(string):
return orth.is_punct(string)
@staticmethod
def is_space(string):
return string.isspace()
@staticmethod
def is_title(string):
return orth.is_title(string)
@staticmethod
def is_upper(string):
return orth.is_upper(string)
@staticmethod
def like_url(string):
return orth.like_url(string)
@staticmethod
def like_number(string):
return orth.like_number(string)
@staticmethod
def like_email(string):
return orth.like_email(string)
@classmethod
def default_lex_attrs(cls, data_dir=None):
return {
attrs.LOWER: cls.lower,
attrs.NORM: cls.norm,
attrs.SHAPE: cls.shape,
attrs.PREFIX: cls.prefix,
attrs.SUFFIX: cls.suffix,
attrs.CLUSTER: cls.cluster,
attrs.PROB: lambda string: -10.0,
attrs.IS_ALPHA: cls.is_alpha,
attrs.IS_ASCII: cls.is_ascii,
attrs.IS_DIGIT: cls.is_digit,
attrs.IS_LOWER: cls.is_lower,
attrs.IS_PUNCT: cls.is_punct,
attrs.IS_SPACE: cls.is_space,
attrs.IS_TITLE: cls.is_title,
attrs.IS_UPPER: cls.is_upper,
attrs.LIKE_URL: cls.like_url,
attrs.LIKE_NUM: cls.like_number,
attrs.LIKE_EMAIL: cls.like_email,
attrs.IS_STOP: lambda string: False,
attrs.IS_OOV: lambda string: True
}
@classmethod
def default_dep_labels(cls):
return {0: {'ROOT': True}}
@classmethod
def default_ner_labels(cls):
return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
@classmethod
def default_data_dir(cls):
return path.join(path.dirname(__file__), 'data')
@classmethod
def default_vectors(cls, data_dir):
return None
@classmethod
def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None):
if data_dir is None:
data_dir = cls.default_data_dir()
if vectors is None:
vectors = cls.default_vectors(data_dir)
if get_lex_attr is None:
get_lex_attr = cls.default_lex_attrs(data_dir)
return Vocab.from_dir(
path.join(data_dir, 'vocab'),
get_lex_attr=get_lex_attr,
vectors=vectors)
@classmethod
def default_tokenizer(cls, vocab, data_dir):
if path.exists(data_dir):
return Tokenizer.from_dir(vocab, data_dir)
else:
return Tokenizer(vocab, {}, None, None, None)
@classmethod
def default_tagger(cls, vocab, data_dir):
if path.exists(data_dir):
return Tagger.from_dir(data_dir, vocab)
else:
return None
@classmethod
def default_parser(cls, vocab, data_dir):
if path.exists(data_dir):
return Parser.from_dir(data_dir, vocab.strings, ArcEager)
else:
return None
@classmethod
def default_entity(cls, vocab, data_dir):
if path.exists(data_dir):
return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
else:
return None
@classmethod
def default_matcher(cls, vocab, data_dir):
if path.exists(data_dir):
return Matcher.from_dir(data_dir, vocab)
else:
return None
def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None,
parser=None, entity=None, matcher=None, serializer=None):
if data_dir is None:
data_dir = self.default_data_dir()
if vocab is None:
vocab = self.default_vocab(data_dir)
if tokenizer is None:
tokenizer = self.default_tokenizer(vocab, data_dir=path.join(data_dir, 'tokenizer'))
if tagger is None:
tagger = self.default_tagger(vocab, data_dir=path.join(data_dir, 'pos'))
if entity is None:
entity = self.default_entity(vocab, data_dir=path.join(data_dir, 'ner'))
if parser is None:
parser = self.default_parser(vocab, data_dir=path.join(data_dir, 'deps'))
if matcher is None:
matcher = self.default_matcher(vocab, data_dir=data_dir)
self.vocab = vocab
self.tokenizer = tokenizer
self.tagger = tagger
self.parser = parser
self.entity = entity
self.matcher = matcher
def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False):
"""Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbitrary whitespace. Alignment into the original string
is preserved.
Args:
text (unicode): The text to be processed.
Returns:
tokens (spacy.tokens.Doc):
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp('An example sentence. Another example sentence.')
>>> tokens[0].orth_, tokens[0].head.tag_
('An', 'NN')
"""
tokens = self.tokenizer(text)
if self.tagger and tag:
self.tagger(tokens)
if self.matcher and entity:
self.matcher(tokens)
if self.parser and parse:
self.parser(tokens)
if self.entity and entity:
self.entity(tokens)
return tokens
def end_training(self, data_dir=None):
if data_dir is None:
data_dir = self.data_dir
self.parser.model.end_training(path.join(data_dir, 'deps', 'model'))
self.entity.model.end_training(path.join(data_dir, 'ner', 'model'))
self.tagger.model.end_training(path.join(data_dir, 'pos', 'model'))
self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
file_.write(
json.dumps([
(TAG, list(self.tagger.freqs[TAG].items())),
(DEP, list(self.parser.moves.freqs[DEP].items())),
(ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())),
(ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())),
(HEAD, list(self.parser.moves.freqs[HEAD].items()))]))
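The default_lex_attrs table maps attribute IDs to plain string functions, so the vocabulary can compute lexical properties lazily for any new string instead of needing the old precomputed get_lex_props dict. A small sketch of consuming the table directly, using only the getters defined above:

from spacy.language import Language
from spacy import attrs

get_lex_attr = Language.default_lex_attrs()
assert get_lex_attr[attrs.LOWER]("Berlin") == "berlin"   # Language.lower
assert get_lex_attr[attrs.PREFIX]("Berlin") == "B"       # first character
assert get_lex_attr[attrs.SUFFIX]("Berlin") == "lin"     # last three characters
assert get_lex_attr[attrs.PROB]("Berlin") == -10.0       # default log-probability stub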

spacy/lemmatizer.py Normal file

@ -0,0 +1,86 @@
from __future__ import unicode_literals
from os import path
import codecs
try:
import ujson as json
except ImportError:
import json
from .parts_of_speech import NOUN, VERB, ADJ
class Lemmatizer(object):
@classmethod
def from_dir(cls, data_dir):
index = {}
exc = {}
for pos in ['adj', 'adv', 'noun', 'verb']:
index[pos] = read_index(path.join(data_dir, 'index.%s' % pos))
exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos))
rules = json.load(open(path.join(data_dir, 'lemma_rules.json')))
return cls(index, exc, rules)
def __init__(self, index, exceptions, rules):
self.index = index
self.exc = exceptions
self.rules = rules
def __call__(self, string, pos):
if pos == NOUN:
pos = 'noun'
elif pos == VERB:
pos = 'verb'
elif pos == ADJ:
pos = 'adj'
else:
return string
lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules.get(pos, []))
return min(lemmas)
def noun(self, string):
return self(string, 'noun')
def verb(self, string):
return self(string, 'verb')
def adj(self, string):
return self(string, 'adj')
def lemmatize(string, index, exceptions, rules):
string = string.lower()
forms = []
if string in index:
forms.append(string)
forms.extend(exceptions.get(string, []))
for old, new in rules:
if string.endswith(old):
form = string[:len(string) - len(old)] + new
if form in index:
forms.append(form)
if not forms:
forms.append(string)
return set(forms)
def read_index(loc):
index = set()
for line in codecs.open(loc, 'r', 'utf8'):
if line.startswith(' '):
continue
pieces = line.split()
word = pieces[0]
if word.count('_') == 0:
index.add(word)
return index
def read_exc(loc):
exceptions = {}
for line in codecs.open(loc, 'r', 'utf8'):
if line.startswith(' '):
continue
pieces = line.split()
exceptions[pieces[0]] = tuple(pieces[1:])
return exceptions
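A small sketch of driving the language-independent Lemmatizer by hand, with a toy index, exception table and rule set standing in for the WordNet files and lemma_rules.json that from_dir would read:

from spacy.lemmatizer import Lemmatizer
from spacy.parts_of_speech import NOUN

index = {'noun': {'pony', 'woman'}, 'verb': set(), 'adj': set()}
exc = {'noun': {'women': ('woman',)}, 'verb': {}, 'adj': {}}
rules = {'noun': [['ies', 'y'], ['s', '']]}
lemmatizer = Lemmatizer(index, exc, rules)
assert lemmatizer('ponies', NOUN) == 'pony'   # rule-based: strip "ies", add "y", check the index
assert lemmatizer('women', NOUN) == 'woman'   # exception table entry wins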


@ -17,6 +17,7 @@ cdef class Lexeme:
cdef readonly attr_t orth
@staticmethod
<<<<<<< HEAD
cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab, int vector_length):
cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, lex.orth)
self.c = lex
@ -41,11 +42,30 @@ cdef class Lexeme:
lex.suffix = value
elif name == CLUSTER:
lex.cluster = value
=======
cdef inline int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1:
lex.length = props['length']
lex.orth = vocab.strings[props['orth']]
lex.lower = vocab.strings[props['lower']]
lex.norm = vocab.strings[props['norm']]
lex.shape = vocab.strings[props['shape']]
lex.prefix = vocab.strings[props['prefix']]
lex.suffix = vocab.strings[props['suffix']]
lex.cluster = props['cluster']
lex.prob = props['prob']
lex.sentiment = props['sentiment']
lex.flags = props['flags']
>>>>>>> de
@staticmethod
cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
if feat_name < (sizeof(flags_t) * 8):
return Lexeme.check_flag(lex, feat_name)
if Lexeme.check_flag(lex, feat_name):
return 1
else:
return 0
elif feat_name == ID:
return lex.id
elif feat_name == ORTH:
@ -66,9 +86,29 @@ cdef class Lexeme:
return lex.cluster
else:
return 0
@staticmethod
cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
if name < (sizeof(flags_t) * 8):
Lexeme.set_flag(lex, name, value)
elif name == ID:
lex.id = value
elif name == LOWER:
lex.lower = value
elif name == NORM:
lex.norm = value
elif name == SHAPE:
lex.shape = value
elif name == PREFIX:
lex.prefix = value
elif name == SUFFIX:
lex.suffix = value
elif name == CLUSTER:
lex.cluster = value
@staticmethod
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
<<<<<<< HEAD
return lexeme.flags & (1 << flag_id)
@staticmethod
@ -78,3 +118,17 @@ cdef class Lexeme:
lexeme.flags |= one << flag_id
else:
lexeme.flags &= ~(one << flag_id)
=======
if lexeme.flags & (1 << flag_id):
return True
else:
return False
@staticmethod
cdef inline bint set_flag(LexemeC* lex, attr_id_t flag_id, int value) nogil:
cdef flags_t one = 1
if value:
lex.flags |= one << flag_id
else:
lex.flags &= ~(one << flag_id)
>>>>>>> de


@ -27,6 +27,17 @@ cdef class Lexeme:
self.vocab = vocab
self.orth = orth
self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
assert self.c.orth == orth
def py_set_flag(self, attr_id_t flag_id):
Lexeme.set_flag(self.c, flag_id, True)
def py_check_flag(self, attr_id_t flag_id):
return True if Lexeme.check_flag(self.c, flag_id) else False
property orth_:
def __get__(self):
return self.vocab.strings[self.c.orth]
property lower:
def __get__(self): return self.c.lower
@ -48,9 +59,13 @@ cdef class Lexeme:
def __get__(self): return self.c.suffix
def __set__(self, int x): self.c.suffix = x
property orth_:
def __get__(self):
return self.vocab.strings[self.c.orth]
property cluster:
def __get__(self): return self.c.suffix
def __set__(self, int x): self.c.suffix = x
property prob:
def __get__(self): return self.c.suffix
def __set__(self, int x): self.c.suffix = x
property lower_:
def __get__(self): return self.vocab.strings[self.c.lower]
@ -72,6 +87,10 @@ cdef class Lexeme:
def __get__(self): return self.c.suffix
def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]
property flags:
def __get__(self): return self.c.flags
def __set__(self, flags_t x): self.c.flags = x
property is_oov:
def __get__(self): return Lexeme.check_flag(self.c, IS_OOV)
def __set__(self, bint x): Lexeme.set_flag(self.c, IS_OOV, x)


@ -8,6 +8,7 @@ from cymem.cymem cimport Pool
from libcpp.vector cimport vector
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc
from .vocab cimport Vocab
@ -53,6 +54,8 @@ cdef int match(const Pattern* pattern, const TokenC* token) except -1:
cdef int i
for i in range(pattern.length):
if get_token_attr(token, pattern.spec[i].attr) != pattern.spec[i].value:
print "Pattern fail", pattern.spec[i].attr, pattern.spec[i].value
print get_token_attr(token, pattern.spec[i].attr)
return False
return True
@ -76,7 +79,10 @@ def _convert_strings(token_specs, string_store):
attr = map_attr_name(attr)
if isinstance(value, basestring):
value = string_store[value]
if isinstance(value, bool):
value = int(value)
converted[-1].append((attr, value))
print "Converted", converted[-1]
return converted
@ -92,6 +98,32 @@ def map_attr_name(attr):
return SHAPE
elif attr == 'NORM':
return NORM
elif attr == 'FLAG13':
return FLAG13
elif attr == 'FLAG14':
return FLAG14
elif attr == 'FLAG15':
return FLAG15
elif attr == 'FLAG16':
return FLAG16
elif attr == 'FLAG17':
return FLAG17
elif attr == 'FLAG18':
return FLAG18
elif attr == 'FLAG19':
return FLAG19
elif attr == 'FLAG20':
return FLAG20
elif attr == 'FLAG21':
return FLAG21
elif attr == 'FLAG22':
return FLAG22
elif attr == 'FLAG23':
return FLAG23
elif attr == 'FLAG24':
return FLAG24
elif attr == 'FLAG25':
return FLAG25
else:
raise Exception("TODO: Finish supporting attr mapping %s" % attr)
@ -99,14 +131,28 @@ def map_attr_name(attr):
cdef class Matcher:
cdef Pool mem
cdef vector[Pattern*] patterns
cdef readonly int n_patterns
cdef readonly Vocab vocab
def __init__(self, vocab, patterns):
self.vocab = vocab
self.mem = Pool()
self.vocab = vocab
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
self.add(entity_key, etype, attrs, specs)
@classmethod
def from_dir(cls, data_dir, Vocab vocab):
patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json')
if path.exists(patterns_loc):
patterns_data = open(patterns_loc).read()
patterns = json.loads(patterns_data)
return cls(vocab, patterns)
else:
return cls(vocab, {})
property n_patterns:
def __get__(self): return self.patterns.size()
def add(self, entity_key, etype, attrs, specs):
if isinstance(entity_key, basestring):
entity_key = self.vocab.strings[entity_key]
@ -120,16 +166,6 @@ cdef class Matcher:
spec = _convert_strings(spec, self.vocab.strings)
self.patterns.push_back(init_pattern(self.mem, spec, etype))
@classmethod
def from_dir(cls, vocab, data_dir):
patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json')
if path.exists(patterns_loc):
patterns_data = open(patterns_loc).read()
patterns = json.loads(patterns_data)
return cls(vocab, patterns)
else:
return cls(vocab, {})
def __call__(self, Doc doc):
cdef vector[Pattern*] partials
cdef int n_partials = 0
@ -139,11 +175,13 @@ cdef class Matcher:
cdef Pattern* state
matches = []
for token_i in range(doc.length):
print 'check', doc[token_i].orth_
token = &doc.data[token_i]
q = 0
for i in range(partials.size()):
state = partials.at(i)
if match(state, token):
print 'match!'
if is_final(state):
matches.append(get_entity(state, token, token_i))
else:
@ -153,6 +191,7 @@ cdef class Matcher:
for i in range(self.n_patterns):
state = self.patterns[i]
if match(state, token):
print 'match!'
if is_final(state):
matches.append(get_entity(state, token, token_i))
else:


@ -1,4 +1,755 @@
from .structs cimport TokenC, Morphology, PosTag
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMapArray
from libc.stdint cimport uint64_t
from .structs cimport TokenC
from .strings cimport StringStore
from .typedefs cimport attr_t
from .parts_of_speech cimport univ_pos_t
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1
cdef struct RichTagC:
uint64_t morph
int id
univ_pos_t pos
attr_t name
cdef struct MorphAnalysisC:
RichTagC tag
attr_t lemma
cdef class Morphology:
cdef readonly Pool mem
cdef readonly StringStore strings
cdef public object lemmatizer
cdef public object n_tags
cdef public object reverse_index
cdef public object tag_names
cdef RichTagC* rich_tags
cdef PreshMapArray _cache
cdef int assign_tag(self, TokenC* token, tag) except -1
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
#
#cpdef enum Feature_t:
# Abbr
# AdpType
# AdvType
# ConjType
# Connegative
# Derivation
# Echo
# Foreign
# Gender_dat
# Gender_erg
# Gender_psor
# Hyph
# InfForm
# NameType
# NounType
# NumberAbs
# NumberDat
# NumberErg
# NumberPsee
# NumberPsor
# NumForm
# NumValue
# PartForm
# PartType
# Person_abs
# Person_dat
# Person_psor
# Polite
# Polite_abs
# Polite_dat
# Prefix
# PrepCase
# PunctSide
# PunctType
# Style
# Typo
# Variant
# VerbType
#
#
#cpdef enum Animacy:
# Anim
# Inam
#
#
#cpdef enum Aspect:
# Freq
# Imp
# Mod
# None_
# Perf
#
#
#cpdef enum Case1:
# Nom
# Gen
# Acc
# Dat
# Voc
# Abl
#
#cdef enum Case2:
# Abe
# Abs
# Ade
# All
# Cau
# Com
# Del
# Dis
#
#cdef enum Case3:
# Ela
# Ess
# Ill
# Ine
# Ins
# Loc
# Lat
# Par
#
#cdef enum Case4:
# Sub
# Sup
# Tem
# Ter
# Tra
#
#
#cpdef enum Definite:
# Two
# Def
# Red
# Ind
#
#
#cpdef enum Degree:
# Cmp
# Comp
# None_
# Pos
# Sup
# Abs
# Com
# Degree # du
#
#
#cpdef enum Gender:
# Com
# Fem
# Masc
# Neut
#
#
#cpdef enum Mood:
# Cnd
# Imp
# Ind
# N
# Pot
# Sub
# Opt
#
#
#cpdef enum Negative:
# Neg
# Pos
# Yes
#
#
#cpdef enum Number:
# Com
# Dual
# None_
# Plur
# Sing
# Ptan # bg
# Count # bg
#
#
#cpdef enum NumType:
# Card
# Dist
# Frac
# Gen
# Mult
# None_
# Ord
# Sets
#
#
#cpdef enum Person:
# One
# Two
# Three
# None_
#
#
#cpdef enum Poss:
# Yes
#
#
#cpdef enum PronType1:
# AdvPart
# Art
# Default
# Dem
# Ind
# Int
# Neg
#
#cpdef enum PronType2:
# Prs
# Rcp
# Rel
# Tot
# Clit
# Exc # es, ca, it, fa
# Clit # it
#
#
#cpdef enum Reflex:
# Yes
#
#
#cpdef enum Tense:
# Fut
# Imp
# Past
# Pres
#
#cpdef enum VerbForm1:
# Fin
# Ger
# Inf
# None_
# Part
# PartFut
# PartPast
#
#cpdef enum VerbForm2:
# PartPres
# Sup
# Trans
# Gdv # la
#
#
#cpdef enum Voice:
# Act
# Cau
# Pass
# Mid # gkc
# Int # hb
#
#
#cpdef enum Abbr:
# Yes # cz, fi, sl, U
#
#cpdef enum AdpType:
# Prep # cz, U
# Post # U
# Voc # cz
# Comprep # cz
# Circ # U
# Voc # U
#
#
#cpdef enum AdvType1:
# # U
# Man
# Loc
# Tim
# Deg
# Cau
# Mod
# Sta
# Ex
#
#cpdef enum AdvType2:
# Adadj
#
#cpdef enum ConjType:
# Oper # cz, U
# Comp # cz, U
#
#cpdef enum Connegative:
# Yes # fi
#
#
#cpdef enum Derivation1:
# Minen # fi
# Sti # fi
# Inen # fi
# Lainen # fi
# Ja # fi
# Ton # fi
# Vs # fi
# Ttain # fi
#
#cpdef enum Derivation2:
# Ttaa
#
#
#cpdef enum Echo:
# Rdp # U
# Ech # U
#
#
#cpdef enum Foreign:
# Foreign # cz, fi, U
# Fscript # cz, fi, U
# Tscript # cz, U
# Yes # sl
#
#
#cpdef enum Gender_dat:
# Masc # bq, U
# Fem # bq, U
#
#
#cpdef enum Gender_erg:
# Masc # bq
# Fem # bq
#
#
#cpdef enum Gender_psor:
# Masc # cz, sl, U
# Fem # cz, sl, U
# Neut # sl
#
#
#cpdef enum Hyph:
# Yes # cz, U
#
#
#cpdef enum InfForm:
# One # fi
# Two # fi
# Three # fi
#
#
#cpdef enum NameType:
# Geo # U, cz
# Prs # U, cz
# Giv # U, cz
# Sur # U, cz
# Nat # U, cz
# Com # U, cz
# Pro # U, cz
# Oth # U, cz
#
#
#cpdef enum NounType:
# Com # U
# Prop # U
# Class # U
#
#cpdef enum Number_abs:
# Sing # bq, U
# Plur # bq, U
#
#cpdef enum Number_dat:
# Sing # bq, U
# Plur # bq, U
#
#cpdef enum Number_erg:
# Sing # bq, U
# Plur # bq, U
#
#cpdef enum Number_psee:
# Sing # U
# Plur # U
#
#
#cpdef enum Number_psor:
# Sing # cz, fi, sl, U
# Plur # cz, fi, sl, U
#
#
#cpdef enum NumForm:
# Digit # cz, sl, U
# Roman # cz, sl, U
# Word # cz, sl, U
#
#
#cpdef enum NumValue:
# One # cz, U
# Two # cz, U
# Three # cz, U
#
#
#cpdef enum PartForm:
# Pres # fi
# Past # fi
# Agt # fi
# Neg # fi
#
#
#cpdef enum PartType:
# Mod # U
# Emp # U
# Res # U
# Inf # U
# Vbp # U
#
#cpdef enum Person_abs:
# One # bq, U
# Two # bq, U
# Three # bq, U
#
#
#cpdef enum Person_dat:
# One # bq, U
# Two # bq, U
# Three # bq, U
#
#
#cpdef enum Person_erg:
# One # bq, U
# Two # bq, U
# Three # bq, U
#
#
#cpdef enum Person_psor:
# One # fi, U
# Two # fi, U
# Three # fi, U
#
#
#cpdef enum Polite:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Polite_abs:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Polite_erg:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Polite_dat:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Prefix:
# Yes # U
#
#
#cpdef enum PrepCase:
# Npr # cz
# Pre # U
#
#
#cpdef enum PunctSide:
# Ini # U
# Fin # U
#
#cpdef enum PunctType1:
# Peri # U
# Qest # U
# Excl # U
# Quot # U
# Brck # U
# Comm # U
# Colo # U
# Semi # U
#
#cpdef enum PunctType2:
# Dash # U
#
#
#cpdef enum Style1:
# Arch # cz, fi, U
# Rare # cz, fi, U
# Poet # cz, U
# Norm # cz, U
# Coll # cz, U
# Vrnc # cz, U
# Sing # cz, U
# Expr # cz, U
#
#
#cpdef enum Style2:
# Derg # cz, U
# Vulg # cz, U
#
#
#cpdef enum Typo:
# Yes # fi, U
#
#
#cpdef enum Variant:
# Short # cz
# Bound # cz, sl
#
#
#cpdef enum VerbType:
# Aux # U
# Cop # U
# Mod # U
# Light # U
#
cpdef enum Value_t:
Animacy_Anim
Animacy_Inam
Aspect_Freq
Aspect_Imp
Aspect_Mod
Aspect_None_
Aspect_Perf
Case_Abe
Case_Abl
Case_Abs
Case_Acc
Case_Ade
Case_All
Case_Cau
Case_Com
Case_Dat
Case_Del
Case_Dis
Case_Ela
Case_Ess
Case_Gen
Case_Ill
Case_Ine
Case_Ins
Case_Loc
Case_Lat
Case_Nom
Case_Par
Case_Sub
Case_Sup
Case_Tem
Case_Ter
Case_Tra
Case_Voc
Definite_Two
Definite_Def
Definite_Red
Definite_Ind
Degree_Cmp
Degree_Comp
Degree_None
Degree_Pos
Degree_Sup
Degree_Abs
Degree_Com
Degree_Dim # du
Gender_Com
Gender_Fem
Gender_Masc
Gender_Neut
Mood_Cnd
Mood_Imp
Mood_Ind
Mood_N
Mood_Pot
Mood_Sub
Mood_Opt
Negative_Neg
Negative_Pos
Negative_Yes
Number_Com
Number_Dual
Number_None
Number_Plur
Number_Sing
Number_Ptan # bg
Number_Count # bg
NumType_Card
NumType_Dist
NumType_Frac
NumType_Gen
NumType_Mult
NumType_None
NumType_Ord
NumType_Sets
Person_One
Person_Two
Person_Three
Person_None
Poss_Yes
PronType_AdvPart
PronType_Art
PronType_Default
PronType_Dem
PronType_Ind
PronType_Int
PronType_Neg
PronType_Prs
PronType_Rcp
PronType_Rel
PronType_Tot
PronType_Clit
PronType_Exc # es, ca, it, fa
Reflex_Yes
Tense_Fut
Tense_Imp
Tense_Past
Tense_Pres
VerbForm_Fin
VerbForm_Ger
VerbForm_Inf
VerbForm_None
VerbForm_Part
VerbForm_PartFut
VerbForm_PartPast
VerbForm_PartPres
VerbForm_Sup
VerbForm_Trans
VerbForm_Gdv # la
Voice_Act
Voice_Cau
Voice_Pass
Voice_Mid # gkc
Voice_Int # hb
Abbr_Yes # cz, fi, sl, U
AdpType_Prep # cz, U
AdpType_Post # U
AdpType_Voc # cz
AdpType_Comprep # cz
AdpType_Circ # U
AdvType_Man
AdvType_Loc
AdvType_Tim
AdvType_Deg
AdvType_Cau
AdvType_Mod
AdvType_Sta
AdvType_Ex
AdvType_Adadj
ConjType_Oper # cz, U
ConjType_Comp # cz, U
Connegative_Yes # fi
Derivation_Minen # fi
Derivation_Sti # fi
Derivation_Inen # fi
Derivation_Lainen # fi
Derivation_Ja # fi
Derivation_Ton # fi
Derivation_Vs # fi
Derivation_Ttain # fi
Derivation_Ttaa # fi
Echo_Rdp # U
Echo_Ech # U
Foreign_Foreign # cz, fi, U
Foreign_Fscript # cz, fi, U
Foreign_Tscript # cz, U
Foreign_Yes # sl
Gender_dat_Masc # bq, U
Gender_dat_Fem # bq, U
Gender_erg_Masc # bq
Gender_erg_Fem # bq
Gender_psor_Masc # cz, sl, U
Gender_psor_Fem # cz, sl, U
Gender_psor_Neut # sl
Hyph_Yes # cz, U
InfForm_One # fi
InfForm_Two # fi
InfForm_Three # fi
NameType_Geo # U, cz
NameType_Prs # U, cz
NameType_Giv # U, cz
NameType_Sur # U, cz
NameType_Nat # U, cz
NameType_Com # U, cz
NameType_Pro # U, cz
NameType_Oth # U, cz
NounType_Com # U
NounType_Prop # U
NounType_Class # U
Number_abs_Sing # bq, U
Number_abs_Plur # bq, U
Number_dat_Sing # bq, U
Number_dat_Plur # bq, U
Number_erg_Sing # bq, U
Number_erg_Plur # bq, U
Number_psee_Sing # U
Number_psee_Plur # U
Number_psor_Sing # cz, fi, sl, U
Number_psor_Plur # cz, fi, sl, U
NumForm_Digit # cz, sl, U
NumForm_Roman # cz, sl, U
NumForm_Word # cz, sl, U
NumValue_One # cz, U
NumValue_Two # cz, U
NumValue_Three # cz, U
PartForm_Pres # fi
PartForm_Past # fi
PartForm_Agt # fi
PartForm_Neg # fi
PartType_Mod # U
PartType_Emp # U
PartType_Res # U
PartType_Inf # U
PartType_Vbp # U
Person_abs_One # bq, U
Person_abs_Two # bq, U
Person_abs_Three # bq, U
Person_dat_One # bq, U
Person_dat_Two # bq, U
Person_dat_Three # bq, U
Person_erg_One # bq, U
Person_erg_Two # bq, U
Person_erg_Three # bq, U
Person_psor_One # fi, U
Person_psor_Two # fi, U
Person_psor_Three # fi, U
Polite_Inf # bq, U
Polite_Pol # bq, U
Polite_abs_Inf # bq, U
Polite_abs_Pol # bq, U
Polite_erg_Inf # bq, U
Polite_erg_Pol # bq, U
Polite_dat_Inf # bq, U
Polite_dat_Pol # bq, U
Prefix_Yes # U
PrepCase_Npr # cz
PrepCase_Pre # U
PunctSide_Ini # U
PunctSide_Fin # U
PunctType_Peri # U
PunctType_Qest # U
PunctType_Excl # U
PunctType_Quot # U
PunctType_Brck # U
PunctType_Comm # U
PunctType_Colo # U
PunctType_Semi # U
PunctType_Dash # U
Style_Arch # cz, fi, U
Style_Rare # cz, fi, U
Style_Poet # cz, U
Style_Norm # cz, U
Style_Coll # cz, U
Style_Vrnc # cz, U
Style_Sing # cz, U
Style_Expr # cz, U
Style_Derg # cz, U
Style_Vulg # cz, U
Style_Yes # fi, U
StyleVariant_StyleShort # cz
StyleVariant_StyleBound # cz, sl
VerbType_Aux # U
VerbType_Cop # U
VerbType_Mod # U
VerbType_Light # U

View File

@@ -1,11 +1,89 @@
# cython: embedsignature=True
from os import path
from .lemmatizer import Lemmatizer
try:
import ujson as json
except ImportError:
import json
from .parts_of_speech import UNIV_POS_NAMES
from .parts_of_speech cimport ADJ, VERB, NOUN
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
morph.number = props.get('number', 0)
morph.tenspect = props.get('tenspect', 0)
morph.mood = props.get('mood', 0)
morph.gender = props.get('gender', 0)
morph.person = props.get('person', 0)
morph.case = props.get('case', 0)
morph.misc = props.get('misc', 0)
cdef class Morphology:
def __init__(self, StringStore string_store, tag_map, lemmatizer):
self.mem = Pool()
self.strings = string_store
self.lemmatizer = lemmatizer
self.n_tags = len(tag_map) + 1
self.tag_names = tuple(sorted(tag_map.keys()))
self.reverse_index = {}
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
self.rich_tags[i].id = i
self.rich_tags[i].name = self.strings[tag_str]
self.rich_tags[i].morph = 0
self.reverse_index[self.rich_tags[i].name] = i
self._cache = PreshMapArray(self.n_tags)
cdef int assign_tag(self, TokenC* token, tag) except -1:
cdef int tag_id
if isinstance(tag, basestring):
try:
tag_id = self.reverse_index[self.strings[tag]]
except KeyError:
print tag
raise
else:
tag_id = tag
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
if analysis is NULL:
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
analysis.tag = self.rich_tags[tag_id]
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth)
token.lemma = analysis.lemma
token.pos = analysis.tag.pos
token.tag = analysis.tag.name
token.morph = analysis.tag.morph
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1:
pass
def load_morph_exceptions(self, dict exc):
# Map (form, pos) to (lemma, rich tag)
cdef unicode pos_str
cdef unicode form_str
cdef unicode lemma_str
cdef dict entries
cdef dict props
cdef int lemma
cdef attr_t orth
cdef int pos
for tag_str, entries in exc.items():
tag = self.strings[tag_str]
rich_tag = self.rich_tags[self.reverse_index[tag]]
for form_str, props in entries.items():
cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
orth = self.strings[form_str]
for name_str, value_str in props.items():
if name_str == 'L':
cached.lemma = self.strings[value_str]
else:
self.assign_feature(&cached.tag.morph, name_str, value_str)
if cached.lemma == 0:
cached.lemma = self.lemmatize(rich_tag.pos, orth)
self._cache.set(rich_tag.pos, orth, <void*>cached)
def lemmatize(self, const univ_pos_t pos, attr_t orth):
if self.lemmatizer is None:
return orth
cdef unicode py_string = self.strings[orth]
if pos != NOUN and pos != VERB and pos != ADJ:
return orth
cdef set lemma_strings
cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string, pos)
lemma_string = sorted(lemma_strings)[0]
lemma = self.strings[lemma_string]
return lemma

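The Morphology class above is configured by two plain-Python inputs: the tag_map (fine-grained tag string to properties) that populates rich_tags, and an exceptions dict (tag string -> {form: props}) in which the 'L' key sets a lemma and any other key is routed through assign_feature. A minimal sketch of those shapes; the concrete tags and feature names are assumptions:

from spacy.strings import StringStore
from spacy.morphology import Morphology

tag_map = {u'NN': {u'pos': u'NOUN'}, u'VBD': {u'pos': u'VERB'}}
morph_exc = {u'VBD': {u'was': {u'L': u'be'}}}   # exceptional form -> lemma override

morphology = Morphology(StringStore(), tag_map, lemmatizer=None)
morphology.load_morph_exceptions(morph_exc)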
View File

@@ -69,7 +69,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu
cpdef bint like_url(unicode string):
# We're looking for things that function in text like URLs. So, valid URL
# or not, anything they say http:// is going to be good.
if string.startswith('http://'):
if string.startswith('http://') or string.startswith('https://'):
return True
elif string.startswith('www.') and len(string) >= 5:
return True
@@ -92,6 +92,7 @@ cpdef bint like_url(unicode string):
return False
# TODO: This should live in the language.orth
NUM_WORDS = set('zero one two three four five six seven eight nine ten '
'eleven twelve thirteen fourteen fifteen sixteen seventeen '
'eighteen nineteen twenty thirty forty fifty sixty seventy'

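The like_url change above only widens the scheme check so that https:// URLs are accepted alongside http://. A quick hedged example, assuming the function lives in spacy.orth as the surrounding code suggests:

from spacy.orth import like_url

assert like_url(u'https://example.com/docs')   # passes via the new https:// branch
assert like_url(u'www.example.com')            # the www. branch is unchanged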
View File

@@ -2,17 +2,22 @@
cpdef enum univ_pos_t:
NO_TAG
ADJ
ADV
ADP
ADV
AUX
CONJ
DET
INTJ
NOUN
NUM
PART
PRON
PRT
PROPN
PUNCT
SCONJ
SYM
VERB
X
PUNCT
EOL
SPACE
N_UNIV_TAGS

View File

@@ -4,17 +4,22 @@ from __future__ import unicode_literals
UNIV_POS_NAMES = {
"NO_TAG": NO_TAG,
"ADJ": ADJ,
"ADV": ADV,
"ADP": ADP,
"ADV": ADV,
"AUX": AUX,
"CONJ": CONJ,
"DET": DET,
"INTJ": INTJ,
"NOUN": NOUN,
"NUM": NUM,
"PART": PART,
"PRON": PRON,
"PRT": PRT,
"PROPN": PROPN,
"PUNCT": PUNCT,
"SCONJ": SCONJ,
"SYM": SYM,
"VERB": VERB,
"X": X,
"PUNCT": PUNCT,
"SPACE": SPACE,
"EOL": EOL
"EOL": EOL,
"SPACE": SPACE
}

View File

@@ -142,6 +142,8 @@ cdef class StringStore:
def load(self, loc):
with codecs.open(loc, 'r', 'utf8') as file_:
strings = file_.read().split(SEPARATOR)
if strings == ['']:
return None
cdef unicode string
cdef bytes byte_string
for string in strings:

View File

@@ -1,4 +1,4 @@
from libc.stdint cimport uint8_t, uint32_t, int32_t
from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t
from .typedefs cimport flags_t, attr_t, hash_t
from .parts_of_speech cimport univ_pos_t
@@ -26,22 +26,6 @@ cdef struct LexemeC:
float l2_norm
cdef struct Morphology:
uint8_t number
uint8_t tenspect # Tense/aspect/voice
uint8_t mood
uint8_t gender
uint8_t person
uint8_t case
uint8_t misc
cdef struct PosTag:
Morphology morph
int id
univ_pos_t pos
cdef struct Entity:
int start
int end
@@ -59,8 +43,8 @@ cdef struct Constituent:
cdef struct TokenC:
const LexemeC* lex
Morphology morph
const Constituent* ctnt
uint64_t morph
univ_pos_t pos
bint spacy
int tag

View File

@@ -11,7 +11,6 @@ from .stateclass cimport StateClass
cdef class Parser:
cdef readonly object cfg
cdef readonly Model model
cdef readonly TransitionSystem moves

View File

@@ -67,16 +67,22 @@ def ParserFactory(transition_system):
cdef class Parser:
def __init__(self, StringStore strings, model_dir, transition_system):
def __init__(self, StringStore strings, transition_system, model):
self.moves = transition_system
self.model = model
@classmethod
def from_dir(cls, model_dir, strings, transition_system):
if not os.path.exists(model_dir):
print >> sys.stderr, "Warning: No model found at", model_dir
elif not os.path.isdir(model_dir):
print >> sys.stderr, "Warning: model path:", model_dir, "is not a directory"
else:
self.cfg = Config.read(model_dir, 'config')
self.moves = transition_system(strings, self.cfg.labels)
templates = get_templates(self.cfg.features)
self.model = Model(self.moves.n_moves, templates, model_dir)
cfg = Config.read(model_dir, 'config')
moves = transition_system(strings, cfg.labels)
templates = get_templates(cfg.features)
model = Model(moves.n_moves, templates, model_dir)
return cls(strings, moves, model)
def __call__(self, Doc tokens):
cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)

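Parser construction is now split: __init__ receives an already-built transition system and model, while from_dir reads the config, labels and weights from a model directory. A hedged sketch of the new path; ArcEager and the 'deps' subdirectory are assumptions about how the models are laid out, and model_dir, vocab and doc come from an existing pipeline:

from os import path
from spacy.syntax.parser import Parser
from spacy.syntax.arc_eager import ArcEager

parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager)
parser(doc)   # parse the Doc in place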
12
spacy/tagger.pxd Normal file
View File

@@ -0,0 +1,12 @@
from ._ml cimport Model
from .structs cimport TokenC
from .vocab cimport Vocab
cdef class Tagger:
cdef readonly Vocab vocab
cdef readonly Model model
cdef public dict freqs
cdef int predict(self, int i, const TokenC* tokens) except -1
cdef int update(self, int i, const TokenC* tokens, int gold) except -1

220
spacy/tagger.pyx Normal file
View File

@@ -0,0 +1,220 @@
import json
from os import path
from collections import defaultdict
from thinc.typedefs cimport atom_t, weight_t
from .typedefs cimport attr_t
from .tokens.doc cimport Doc
from .attrs cimport TAG
from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
from .attrs cimport *
from ._ml cimport arg_max
cpdef enum:
P2_orth
P2_cluster
P2_shape
P2_prefix
P2_suffix
P2_pos
P2_lemma
P2_flags
P1_orth
P1_cluster
P1_shape
P1_prefix
P1_suffix
P1_pos
P1_lemma
P1_flags
W_orth
W_cluster
W_shape
W_prefix
W_suffix
W_pos
W_lemma
W_flags
N1_orth
N1_cluster
N1_shape
N1_prefix
N1_suffix
N1_pos
N1_lemma
N1_flags
N2_orth
N2_cluster
N2_shape
N2_prefix
N2_suffix
N2_pos
N2_lemma
N2_flags
N_CONTEXT_FIELDS
cdef class Tagger:
"""A part-of-speech tagger for English"""
@classmethod
def read_config(cls, data_dir):
return json.load(open(path.join(data_dir, 'pos', 'config.json')))
@classmethod
def default_templates(cls):
return (
(W_orth,),
(P1_lemma, P1_pos),
(P2_lemma, P2_pos),
(N1_orth,),
(N2_orth,),
(W_suffix,),
(W_prefix,),
(P1_pos,),
(P2_pos,),
(P1_pos, P2_pos),
(P1_pos, W_orth),
(P1_suffix,),
(N1_suffix,),
(W_shape,),
(W_cluster,),
(N1_cluster,),
(N2_cluster,),
(P1_cluster,),
(P2_cluster,),
(W_flags,),
(N1_flags,),
(N2_flags,),
(P1_flags,),
(P2_flags,),
)
@classmethod
def blank(cls, vocab, templates):
model = Model(vocab.morphology.n_tags, templates, model_loc=None)
return cls(vocab, model)
@classmethod
def from_dir(cls, data_dir, vocab):
if path.exists(path.join(data_dir, 'templates.json')):
templates = json.loads(open(path.join(data_dir, 'templates.json')).read())
else:
templates = cls.default_templates()
model = Model(vocab.morphology.n_tags, templates, data_dir)
return cls(vocab, model)
def __init__(self, Vocab vocab, model):
self.vocab = vocab
self.model = model
# TODO: Move this to tag map
self.freqs = {TAG: defaultdict(int)}
for tag in self.tag_names:
self.freqs[TAG][self.vocab.strings[tag]] = 1
self.freqs[TAG][0] = 1
@property
def tag_names(self):
return self.vocab.morphology.tag_names
def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object.
Args:
tokens (Doc): The tokens to be tagged.
"""
if tokens.length == 0:
return 0
cdef int i
cdef const weight_t* scores
for i in range(tokens.length):
if tokens.data[i].pos == 0:
guess = self.predict(i, tokens.data)
self.vocab.morphology.assign_tag(&tokens.data[i], guess)
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
def tag_from_strings(self, Doc tokens, object tag_strs):
cdef int i
for i in range(tokens.length):
self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
def train(self, Doc tokens, object gold_tag_strs):
assert len(tokens) == len(gold_tag_strs)
cdef int i
cdef int loss
cdef const weight_t* scores
try:
golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
except ValueError:
raise ValueError(
[g for g in gold_tag_strs if g is not None and g not in self.tag_names])
correct = 0
for i in range(tokens.length):
guess = self.update(i, tokens.data, golds[i])
loss = golds[i] != -1 and guess != golds[i]
self.vocab.morphology.assign_tag(&tokens.data[i], guess)
correct += loss == 0
self.freqs[TAG][tokens.data[i].tag] += 1
return correct
cdef int predict(self, int i, const TokenC* tokens) except -1:
cdef atom_t[N_CONTEXT_FIELDS] context
_fill_from_token(&context[P2_orth], &tokens[i-2])
_fill_from_token(&context[P1_orth], &tokens[i-1])
_fill_from_token(&context[W_orth], &tokens[i])
_fill_from_token(&context[N1_orth], &tokens[i+1])
_fill_from_token(&context[N2_orth], &tokens[i+2])
scores = self.model.score(context)
return arg_max(scores, self.model.n_classes)
cdef int update(self, int i, const TokenC* tokens, int gold) except -1:
cdef atom_t[N_CONTEXT_FIELDS] context
_fill_from_token(&context[P2_orth], &tokens[i-2])
_fill_from_token(&context[P1_orth], &tokens[i-1])
_fill_from_token(&context[W_orth], &tokens[i])
_fill_from_token(&context[N1_orth], &tokens[i+1])
_fill_from_token(&context[N2_orth], &tokens[i+2])
scores = self.model.score(context)
guess = arg_max(scores, self.model.n_classes)
loss = guess != gold if gold != -1 else 0
self.model.update(context, guess, gold, loss)
return guess
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
context[0] = t.lex.lower
context[1] = t.lex.cluster
context[2] = t.lex.shape
context[3] = t.lex.prefix
context[4] = t.lex.suffix
context[5] = t.tag
context[6] = t.lemma
if t.lex.flags & (1 << IS_ALPHA):
context[7] = 1
elif t.lex.flags & (1 << IS_PUNCT):
context[7] = 2
elif t.lex.flags & (1 << LIKE_URL):
context[7] = 3
elif t.lex.flags & (1 << LIKE_NUM):
context[7] = 4
else:
context[7] = 0

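The new Tagger scores each token from a five-token window (P2..N2), with each position contributing lower-cased form, cluster, shape, prefix, suffix, tag, lemma and a flags atom, then greedily assigns the argmax class; train() updates the model against gold tag strings and returns the number of correct guesses. A hedged usage sketch; vocab, doc and training_data are assumed to exist:

from spacy.tagger import Tagger

tagger = Tagger.blank(vocab, Tagger.default_templates())
for doc, gold_tags in training_data:           # gold_tags: fine-grained tag strings
    n_correct = tagger.train(doc, gold_tags)   # greedy update against the gold tags
tagger(doc)                                    # assign tags in place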
View File

@@ -4,15 +4,10 @@ from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from .typedefs cimport hash_t
from .structs cimport LexemeC, TokenC, Morphology
from .structs cimport LexemeC, TokenC
from .strings cimport StringStore
from .tokens.doc cimport Doc
from .vocab cimport Vocab, _Cached
cdef union LexemesOrTokens:
const LexemeC* const* lexemes
TokenC* tokens
from .vocab cimport Vocab, LexemesOrTokens, _Cached
cdef class Tokenizer:

View File

@@ -11,7 +11,6 @@ from cpython cimport Py_UNICODE_ISSPACE
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from .morphology cimport set_morph_from_dict
from .strings cimport hash_string
cimport cython
@@ -29,7 +28,7 @@ cdef class Tokenizer:
self._suffix_re = suffix_re
self._infix_re = infix_re
self.vocab = vocab
self._load_special_tokenization(rules, self.vocab.pos_tags)
self._load_special_tokenization(rules)
@classmethod
def from_dir(cls, Vocab vocab, data_dir):
@@ -193,9 +192,7 @@ cdef class Tokenizer:
tokens.push_back(prefixes[0][i], False)
if string:
cache_hit = self._try_cache(hash_string(string), tokens)
if cache_hit:
pass
else:
if not cache_hit:
match = self.find_infix(string)
if match is None:
tokens.push_back(self.vocab.get(tokens.mem, string), False)
@@ -242,7 +239,7 @@ cdef class Tokenizer:
match = self._suffix_re.search(string)
return (match.end() - match.start()) if match is not None else 0
def _load_special_tokenization(self, object rules, object tag_map):
def _load_special_tokenization(self, special_cases):
'''Add special-case tokenization rules.
'''
cdef int i
@@ -253,29 +250,11 @@ cdef class Tokenizer:
cdef dict props
cdef LexemeC** lexemes
cdef hash_t hashed
for chunk, substrings in sorted(rules.items()):
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
form = props['F']
lemma = props.get("L", None)
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
if lemma is not None:
tokens[i].lemma = self.vocab.strings[lemma]
else:
tokens[i].lemma = 0
if 'pos' in props:
tokens[i].tag = self.vocab.strings[props['pos']]
tokens[i].pos = tag_map[props['pos']][0]
# These are defaults, which can be over-ridden by the
# token-specific props.
set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1])
if tokens[i].lemma == 0:
tokens[i].lemma = tokens[i].lex.orth
set_morph_from_dict(&tokens[i].morph, props)
for chunk, substrings in sorted(special_cases.items()):
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = len(substrings)
cached.is_lex = False
cached.data.tokens = tokens
hashed = hash_string(chunk)
self._specials.set(hashed, cached)
self._cache.set(hashed, cached)
cached.data.tokens = self.vocab.make_fused_token(substrings)
key = hash_string(chunk)
self._specials.set(key, cached)
self._cache.set(key, cached)

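_load_special_tokenization is now a thin wrapper: it hashes each special-case string and caches the tokens produced by vocab.make_fused_token(substrings), so the per-token attribute handling lives in the Vocab. A sketch of one special-case entry, using only the keys read there ('F' form, 'L' lemma, 'pos' tag, 'morph' features); the concrete tag and feature names are assumptions:

special_cases = {
    u"don't": [
        {'F': u'do', 'pos': u'VBP'},
        {'F': u"n't", 'L': u'not', 'morph': {'Negative': 'Neg'}},
    ],
}
tokenizer._load_special_tokenization(special_cases)   # tokenizer: an existing Tokenizer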
View File

@@ -12,11 +12,11 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
ctypedef const LexemeC* const_Lexeme_ptr
ctypedef TokenC* TokenC_ptr
ctypedef const TokenC* const_TokenC_ptr
ctypedef fused LexemeOrToken:
const_Lexeme_ptr
TokenC_ptr
const_TokenC_ptr
cdef class Doc:

View File

@@ -14,6 +14,7 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..parts_of_speech import UNIV_POS_NAMES
from ..parts_of_speech cimport CONJ, PUNCT, NOUN
from ..parts_of_speech cimport univ_pos_t
from ..lexeme cimport Lexeme
from .spans cimport Span
from .token cimport Token
from ..serialize.bits cimport BitArray
@@ -210,7 +211,7 @@ cdef class Doc:
if self.length == self.max_length:
self._realloc(self.length * 2)
cdef TokenC* t = &self.data[self.length]
if LexemeOrToken is TokenC_ptr:
if LexemeOrToken is const_TokenC_ptr:
t[0] = lex_or_tok[0]
else:
t.lex = lex_or_tok
@@ -218,6 +219,7 @@
t.idx = 0
else:
t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
assert t.lex.orth != 0
t.spacy = has_space
self.length += 1
self._py_tokens.append(None)

View File

@@ -1,7 +1,7 @@
from __future__ import unicode_literals
from collections import defaultdict
from ..structs cimport Morphology, TokenC, LexemeC
from ..structs cimport TokenC, LexemeC
from ..typedefs cimport flags_t, attr_t
from ..attrs cimport attr_id_t
from ..parts_of_speech cimport univ_pos_t

View File

@@ -20,6 +20,8 @@ from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from ..attrs cimport IS_OOV
from ..lexeme cimport Lexeme
cdef class Token:
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created

View File

@@ -7,6 +7,7 @@ from murmurhash.mrmr cimport hash64
from .structs cimport LexemeC, TokenC
from .typedefs cimport utf8_t, attr_t, hash_t
from .strings cimport StringStore
from .morphology cimport Morphology
cdef LexemeC EMPTY_LEXEME
@@ -14,7 +15,7 @@ cdef LexemeC EMPTY_LEXEME
cdef union LexemesOrTokens:
const LexemeC* const* lexemes
TokenC* tokens
const TokenC* tokens
cdef struct _Cached:
@@ -27,15 +28,18 @@ cdef class Vocab:
cpdef public lexeme_props_getter
cdef Pool mem
cpdef readonly StringStore strings
cdef readonly object pos_tags
cpdef readonly Morphology morphology
cdef readonly int length
cdef public object _serializer
cdef public object data_dir
cdef public float oov_prob
cdef public object get_lex_attr
cdef public object pos_tags
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
cdef const TokenC* make_fused_token(self, substrings) except NULL
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL

View File

@@ -17,10 +17,12 @@ from .strings cimport hash_string
from .orth cimport word_shape
from .typedefs cimport attr_t
from .cfile cimport CFile
from .lemmatizer import Lemmatizer
from cymem.cymem cimport Address
from . import util
from .serialize.packer cimport Packer
from .attrs cimport PROB
DEF MAX_VEC_SIZE = 100000
@@ -35,30 +37,31 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
cdef class Vocab:
'''A map container for a language's LexemeC structs.
'''
def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=False):
def __init__(self, get_lex_attr=None, tag_map=None, vectors=None):
self.mem = Pool()
self._by_hash = PreshMap()
self._by_orth = PreshMap()
self.strings = StringStore()
#self.pos_tags = pos_tags if pos_tags is not None else {}
self.pos_tags = {}
self.get_lex_attr = get_lex_attr
self.repvec_length = 0
self.length = 0
self._add_lex_to_vocab(0, &EMPTY_LEXEME)
if data_dir is not None:
if not path.exists(data_dir):
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
if not path.isdir(data_dir):
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
self.load_lexemes(path.join(data_dir, 'strings.txt'),
path.join(data_dir, 'lexemes.bin'))
if load_vectors and path.exists(path.join(data_dir, 'vec.bin')):
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
self.morphology = Morphology(self.strings, tag_map, Lemmatizer({}, {}, {}))
self.length = 1
self._serializer = None
self.data_dir = data_dir
@classmethod
def from_dir(cls, data_dir, get_lex_attr=None, vectors=None):
if not path.exists(data_dir):
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
if not path.isdir(data_dir):
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map)
self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
return self
property serializer:
def __get__(self):
@@ -84,7 +87,9 @@ cdef class Vocab:
cdef LexemeC* lex
cdef hash_t key = hash_string(string)
lex = <LexemeC*>self._by_hash.get(key)
cdef size_t addr
if lex != NULL:
assert lex.orth == self.strings[string]
return lex
else:
return self._new_lexeme(mem, string)
@@ -103,16 +108,29 @@
return self._new_lexeme(mem, self.strings[orth])
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
cdef hash_t key
cdef bint is_oov = mem is not self.mem
mem = self.mem
if len(string) < 3:
mem = self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
for attr, func in self.lex_attr_getters.items():
Lexeme.set_struct_attr(lex, attr, func(string))
lex.orth = self.strings[string]
lex.length = len(string)
lex.id = self.length
if self.get_lex_attr is not None:
for attr, func in self.get_lex_attr.items():
value = func(string)
if isinstance(value, unicode):
value = self.strings[value]
if attr == PROB:
lex.prob = value
else:
Lexeme.set_struct_attr(lex, attr, value)
if is_oov:
lex.id = 0
else:
self._add_lex_to_vocab(hash_string(string), lex)
key = hash_string(string)
self._add_lex_to_vocab(key, lex)
assert lex != NULL, string
return lex
@@ -125,7 +143,7 @@ cdef class Vocab:
cdef attr_t orth
cdef size_t addr
for orth, addr in self._by_orth.items():
yield Lexeme.from_ptr(<LexemeC*>addr, self, self.repvec_length)
yield Lexeme(self, orth)
def __getitem__(self, id_or_string):
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
@@ -142,23 +160,29 @@ cdef class Vocab:
An instance of the Lexeme Python class, with data copied on
instantiation.
'''
cdef const LexemeC* lexeme
cdef attr_t orth
if type(id_or_string) == int:
orth = id_or_string
lexeme = <LexemeC*>self._by_orth.get(orth)
if lexeme == NULL:
raise KeyError(id_or_string)
assert lexeme.orth == orth, ('%d vs %d' % (lexeme.orth, orth))
elif type(id_or_string) == unicode:
lexeme = self.get(self.mem, id_or_string)
assert lexeme.orth == self.strings[id_or_string]
if type(id_or_string) == unicode:
orth = self.strings[id_or_string]
else:
raise ValueError("Vocab unable to map type: "
"%s. Maps unicode --> Lexeme or "
"int --> Lexeme" % str(type(id_or_string)))
return Lexeme.from_ptr(<LexemeC*><void*>lexeme, self, self.repvec_length)
orth = id_or_string
return Lexeme(self, orth)
cdef const TokenC* make_fused_token(self, substrings) except NULL:
cdef int i
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
token = &tokens[i]
# Set the special tokens up to have morphology and lemmas if
# specified, otherwise use the part-of-speech tag (if specified)
token.lex = <LexemeC*>self.get(self.mem, props['F'])
if 'pos' in props:
self.morphology.assign_tag(token, props['pos'])
if 'L' in props:
tokens[i].lemma = self.strings[props['L']]
for feature, value in props.get('morph', {}).items():
self.morphology.assign_feature(&token.morph, feature, value)
return tokens
def dump(self, loc):
if path.exists(loc):
assert not path.isdir(loc)

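With get_lex_props gone, the Vocab now builds lexemes lazily through the get_lex_attr functions, and __getitem__ accepts either an orth ID or a unicode string and returns a Lexeme for it. A hedged sketch of the new construction path, mirroring the build script and tests in this commit; the probability value is illustrative only:

import spacy.en
from spacy.vocab import Vocab

vocab = Vocab(get_lex_attr=spacy.en.English.default_lex_attrs(), tag_map={})
lex = vocab[u'dog']          # lexeme is created on first access via get_lex_attr
lex.prob = -8.0              # attributes can be assigned directly, as the build script does
assert vocab[vocab.strings[u'dog']].orth_ == u'dog'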
View File

@@ -1,6 +1,7 @@
import pytest
@pytest.mark.models
def test_initial(EN):
doc = EN.tokenizer(u'I ate the pizza with anchovies.')
EN.tagger(doc)

View File

@@ -41,25 +41,10 @@ def test_attribute():
def test_vocab_codec():
def get_lex_props(string, prob):
return {
'flags': 0,
'length': len(string),
'orth': string,
'lower': string,
'norm': string,
'shape': string,
'prefix': string[0],
'suffix': string[-3:],
'cluster': 0,
'prob': prob,
'sentiment': 0
}
vocab = Vocab()
vocab['dog'] = get_lex_props('dog', 0.001)
vocab['the'] = get_lex_props('the', 0.05)
vocab['jumped'] = get_lex_props('jumped', 0.005)
lex = vocab['dog']
lex = vocab['the']
lex = vocab['jumped']
codec = HuffmanCodec([(lex.orth, lex.prob) for lex in vocab])

View File

@@ -5,6 +5,7 @@ import re
import pytest
import numpy
from spacy.language import Language
from spacy.vocab import Vocab
from spacy.tokens.doc import Doc
from spacy.tokenizer import Tokenizer
@@ -17,30 +18,14 @@ from spacy.serialize.packer import Packer
from spacy.serialize.bits import BitArray
def get_lex_props(string, prob=-22, is_oov=False):
return {
'flags': 0,
'length': len(string),
'orth': string,
'lower': string,
'norm': string,
'shape': string,
'prefix': string[0],
'suffix': string[-3:],
'cluster': 0,
'prob': prob,
'sentiment': 0
}
@pytest.fixture
def vocab():
vocab = Vocab(get_lex_props=get_lex_props)
vocab['dog'] = get_lex_props('dog', 0.001)
vocab = Vocab(Language.default_lex_attrs())
lex = vocab['dog']
assert vocab[vocab.strings['dog']].orth_ == 'dog'
vocab['the'] = get_lex_props('the', 0.01)
vocab['quick'] = get_lex_props('quick', 0.005)
vocab['jumped'] = get_lex_props('jumped', 0.007)
lex = vocab['the']
lex = vocab['quick']
lex = vocab['jumped']
return vocab

View File

@@ -14,6 +14,7 @@ def tagged(EN):
tokens = EN(string, tag=True, parse=False)
return tokens
@pytest.mark.models
def test_spaces(tagged):
assert tagged[0].pos != SPACE
assert tagged[0].pos_ != 'SPACE'

View File

@@ -1,80 +1,81 @@
# -*- coding: utf-8 -*-
"""Sphinx doctest is just too hard. Manually paste doctest examples here"""
import pytest
@pytest.mark.models
def test_1():
import spacy.en
from spacy.parts_of_speech import ADV
# Load the pipeline, and call it with some text.
nlp = spacy.en.English()
tokens = nlp(u"Give it back, he pleaded abjectly, its mine.",
tag=True, parse=False)
o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)
assert u"Give it BACK, he pleaded ABJECTLY, its mine."
o = nlp.vocab[u'back'].prob
assert o == -7.033305644989014
o = nlp.vocab[u'not'].prob
assert o == -5.332601070404053
o = nlp.vocab[u'quietly'].prob
assert o == -11.994928359985352
@pytest.mark.models
def test2():
import spacy.en
from spacy.parts_of_speech import ADV
nlp = spacy.en.English()
# Find log probability of Nth most frequent word
probs = [lex.prob for lex in nlp.vocab]
probs.sort()
is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
tokens = nlp(u"Give it back, he pleaded abjectly, its mine.")
o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
o == u'Give it back, he pleaded ABJECTLY, its mine.'
@pytest.mark.models
def test3():
import spacy.en
from spacy.parts_of_speech import ADV
nlp = spacy.en.English()
# Find log probability of Nth most frequent word
probs = [lex.prob for lex in nlp.vocab]
probs.sort()
is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
tokens = nlp(u"Give it back, he pleaded abjectly, its mine.")
o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
assert o == u'Give it back, he pleaded ABJECTLY, its mine.'
pleaded = tokens[7]
assert pleaded.repvec.shape == (300,)
o = pleaded.repvec[:5]
assert sum(o) != 0
from numpy import dot
from numpy.linalg import norm
cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
words = [w for w in nlp.vocab if w.is_lower and w.has_repvec]
words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
words.reverse()
o = [w.orth_ for w in words[0:20]]
assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded',
u'pleads', u'testified', u'conspired', u'motioned', u'demurred',
u'countersued', u'remonstrated', u'begged', u'apologised',
u'consented', u'acquiesced', u'petitioned', u'quarreled',
u'appealed', u'pleading']
o = [w.orth_ for w in words[50:60]]
assert o == [u'martialed', u'counselled', u'bragged',
u'backtracked', u'caucused', u'refiled', u'dueled', u'mused',
u'dissented', u'yearned']
o = [w.orth_ for w in words[100:110]]
assert o == [u'acquits', u'cabled', u'ducked', u'sentenced',
u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed',
u'clerked']
#o = [w.orth_ for w in words[1000:1010]]
#assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled',
# u'posited', u'firebombed', u'slimed', u'deferred', u'sagged']
#o = [w.orth_ for w in words[50000:50010]]
#assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid',
# u'dirty', u'rims', u'artists']
#@pytest.mark.models
#def test_1():
# import spacy.en
# from spacy.parts_of_speech import ADV
# # Load the pipeline, and call it with some text.
# nlp = spacy.en.English()
# tokens = nlp(u"Give it back, he pleaded abjectly, its mine.",
# tag=True, parse=False)
# o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)
# assert u"Give it BACK, he pleaded ABJECTLY, its mine."
#
# o = nlp.vocab[u'back'].prob
# assert o == -7.033305644989014
# o = nlp.vocab[u'not'].prob
# assert o == -5.332601070404053
# o = nlp.vocab[u'quietly'].prob
# assert o == -11.994928359985352
#
#
#@pytest.mark.m
#def test2():
# import spacy.en
# from spacy.parts_of_speech import ADV
# nlp = spacy.en.English()
# # Find log probability of Nth most frequent word
# probs = [lex.prob for lex in nlp.vocab]
# probs.sort()
# is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
# tokens = nlp(u"Give it back, he pleaded abjectly, its mine.")
# o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
# o == u'Give it back, he pleaded ABJECTLY, its mine.'
#
#@pytest.mark.models
#def test3():
# import spacy.en
# from spacy.parts_of_speech import ADV
# nlp = spacy.en.English()
# # Find log probability of Nth most frequent word
# probs = [lex.prob for lex in nlp.vocab]
# probs.sort()
# is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
# tokens = nlp(u"Give it back, he pleaded abjectly, its mine.")
# o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
# assert o == u'Give it back, he pleaded ABJECTLY, its mine.'
#
# pleaded = tokens[7]
# assert pleaded.repvec.shape == (300,)
# o = pleaded.repvec[:5]
# assert sum(o) != 0
# from numpy import dot
# from numpy.linalg import norm
#
# cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
# words = [w for w in nlp.vocab if w.is_lower and w.has_repvec]
# words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
# words.reverse()
# o = [w.orth_ for w in words[0:20]]
# assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded',
# u'pleads', u'testified', u'conspired', u'motioned', u'demurred',
# u'countersued', u'remonstrated', u'begged', u'apologised',
# u'consented', u'acquiesced', u'petitioned', u'quarreled',
# u'appealed', u'pleading']
# o = [w.orth_ for w in words[50:60]]
# assert o == [u'martialed', u'counselled', u'bragged',
# u'backtracked', u'caucused', u'refiled', u'dueled', u'mused',
# u'dissented', u'yearned']
# o = [w.orth_ for w in words[100:110]]
# assert o == [u'acquits', u'cabled', u'ducked', u'sentenced',
# u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed',
# u'clerked']
#
# #o = [w.orth_ for w in words[1000:1010]]
# #assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled',
# # u'posited', u'firebombed', u'slimed', u'deferred', u'sagged']
# #o = [w.orth_ for w in words[50000:50010]]
# #assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid',
# # u'dirty', u'rims', u'artists']