* Tmp

2026-01-08 17:51:16 +03:00 · 2014-12-21 05:36:29 +11:00 · 2014-12-21 05:36:29 +11:00 · e1c1a4b868
commit e1c1a4b868
parent d11c1edf8c
42 changed files with 138 additions and 2382 deletions
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@ -1,135 +0,0 @@
-from thinc.typedefs cimport atom_t
-
-from .lang cimport Language
-from .tokens cimport Tokens
-from .tokens cimport TokenC
-
-
-# Values for the 'person' morphological property. The POS_TAGS table in
-# en.pyx sets THIRD for the 'VBZ' tag; no other member is used there.
-cpdef enum en_person_t:
-    NO_PERSON
-    FIRST
-    SECOND
-    THIRD
-    NON_THIRD
-
-
-# Values for the 'number' morphological property. The POS_TAGS table in
-# en.pyx sets PLURAL for the 'NNS' and 'NNPS' tags.
-cpdef enum en_number_t:
-    NO_NUMBER
-    SINGULAR
-    PLURAL
-    MASS
-
-
-# Gender values. NOTE(review): no entry of the POS_TAGS table in en.pyx
-# sets a gender, so these members look unused by the tag map.
-cpdef enum en_gender_t:
-    NO_GENDER
-    MASCULINE
-    FEMININE
-    NEUTER
-
-
-# Values for the 'case' morphological property. The POS_TAGS table in
-# en.pyx sets GENITIVE for the 'POS', 'PRP$' and 'WP$' tags.
-cpdef enum en_case_t:
-    NO_CASE
-    NOMINATIVE
-    GENITIVE
-    ACCUSATIVE
-    REFLEXIVE
-    DEMONYM
-
-
-# Values for the 'tenspect' morphological property. In the POS_TAGS table
-# of en.pyx: MODAL <- 'MD', PAST <- 'VBD', ING <- 'VBG', PASSIVE <- 'VBN',
-# PRESENT <- 'VBP' and 'VBZ'.
-cpdef enum en_tenspect_t:
-    NO_TENSE
-    BASE_VERB
-    PRESENT
-    PAST
-    PASSIVE
-    ING
-    MODAL
-
-
-# Values for the 'misc' morphological property. In the POS_TAGS table of
-# en.pyx: COMPARATIVE <- 'JJR'/'RBR', SUPERLATIVE <- 'JJS'/'RBS',
-# NAME <- 'NNP'/'NNPS', RELATIVE <- 'WDT'/'WP'/'WP$'/'WRB'.
-cpdef enum misc_t:
-    NO_MISC
-    COMPARATIVE
-    SUPERLATIVE
-    RELATIVE
-    NAME
-
-    
-# Flags
-# Each member is a bit position in a flags_t bit field: English.set_flags in
-# en.pyx shifts the result of the matching orth predicate left by the member.
-# Only IS_ALPHA..IS_UPPER, LIKE_URL and LIKE_NUMBER are set by set_flags; the
-# OFT_* and IN_* members are not written anywhere in en.pyx.
-cpdef enum FlagID:
-    IS_ALPHA
-    IS_ASCII
-    IS_DIGIT
-    IS_LOWER
-    IS_PUNCT
-    IS_SPACE
-    IS_TITLE
-    IS_UPPER
-
-    LIKE_URL
-    LIKE_NUMBER
-
-    OFT_LOWER
-    OFT_TITLE
-    OFT_UPPER
-
-    IN_MALES
-    IN_FEMALES
-    IN_SURNAMES
-    IN_PLACES
-    IN_GAMES
-    IN_CELEBS
-    IN_NAMES
-
-
-# Indices into the atom_t context array used for POS tagging in en.pyx.
-# There are five token slots (P2, P1, W, N1, N2) of eight fields each.
-# fill_pos_context passes the address of each slot's *_sic entry to
-# _fill_from_token, which writes eight consecutive values in exactly this
-# order (sic, cluster, shape, prefix, suffix, pos, lemma, pos_type), so the
-# member order inside a slot must not change. N_CONTEXT_FIELDS is the array
-# length.
-cpdef enum:
-    P2_sic
-    P2_cluster
-    P2_shape
-    P2_prefix
-    P2_suffix
-    P2_pos
-    P2_lemma
-    P2_pos_type
-
-    P1_sic
-    P1_cluster
-    P1_shape
-    P1_prefix
-    P1_suffix
-    P1_pos
-    P1_lemma
-    P1_pos_type
-
-    W_sic
-    W_cluster
-    W_shape
-    W_prefix
-    W_suffix
-    W_pos
-    W_lemma
-    W_pos_type
-
-    N1_sic
-    N1_cluster
-    N1_shape
-    N1_prefix
-    N1_suffix
-    N1_pos
-    N1_lemma
-    N1_pos_type
-
-    N2_sic
-    N2_cluster
-    N2_shape
-    N2_prefix
-    N2_suffix
-    N2_pos
-    N2_lemma
-    N2_pos_type
-
-    N_CONTEXT_FIELDS
-
-
-cdef class English(Language):
-    # C-level method declarations. Both return -1 to signal an exception;
-    # their bodies in en.pyx are placeholder stubs (``pass``).
-    cdef int is_base_np_end(self, const TokenC* token) except -1
-    cdef int is_outside_base_np(self, const TokenC* token) except -1
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@ -1,213 +0,0 @@
-# cython: profile=True
-# cython: embedsignature=True
-'''Tokenize English text, using a scheme that differs from the Penn Treebank 3
-scheme in several important respects:
-
-* Whitespace is added as tokens, except for single spaces. e.g.,
-
-    >>> [w.string for w in EN.tokenize(u'\\nHello  \\tThere')]
-    [u'\\n', u'Hello', u' ', u'\\t', u'There']
-
-* Contractions are normalized, e.g.
-
-    >>> [w.string for w in EN.tokenize(u"isn't ain't won't he's")]
-    [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]
-  
-* Hyphenated words are split, with the hyphen preserved, e.g.:
-    
-    >>> [w.string for w in EN.tokenize(u'New York-based')]
-    [u'New', u'York', u'-', u'based']
-
-Other improvements:
-
-* Email addresses, URLs, European-formatted dates and other numeric entities not
-  found in the PTB are tokenized correctly
-* Heuristic handling of word-final periods (PTB expects sentence boundary detection
-  as a pre-process before tokenization.)
-
-Take care to ensure your training and run-time data is tokenized according to the
-same scheme. Tokenization problems are a major cause of poor performance for
-NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
-provides a fully Penn Treebank 3-compliant tokenizer.
-'''
-from __future__ import unicode_literals
-
-from murmurhash.mrmr cimport hash64
-
-cimport lang
-from .typedefs cimport hash_t, id_t, flags_t
-import orth
-from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
-from .morphology cimport X, PUNCT, EOL
-
-from .tokens cimport Morphology
-
-
-DEF USE_POS_CACHE = True
-
-
-# Maps a tag name to a pair: (coarse part-of-speech constant cimported from
-# .morphology, dict of morphological properties). The property keys used
-# here are 'misc', 'tenspect', 'number', 'case' and 'person'; an empty dict
-# means no property is set for that tag.
-POS_TAGS = {
-    'NULL': (NO_TAG, {}),
-    'EOL': (EOL, {}),
-    'CC': (CONJ, {}),
-    'CD': (NUM, {}),
-    'DT': (DET, {}),
-    'EX': (DET, {}),
-    'FW': (X, {}),
-    'IN': (ADP, {}),
-    'JJ': (ADJ, {}),
-    'JJR': (ADJ, {'misc': COMPARATIVE}),
-    'JJS': (ADJ, {'misc': SUPERLATIVE}),
-    'LS': (X, {}),
-    'MD': (VERB, {'tenspect': MODAL}),
-    'NN': (NOUN, {}),
-    'NNS': (NOUN, {'number': PLURAL}),
-    'NNP': (NOUN, {'misc': NAME}),
-    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
-    'PDT': (DET, {}),
-    'POS': (PRT, {'case': GENITIVE}),
-    'PRP': (NOUN, {}),
-    'PRP$': (NOUN, {'case': GENITIVE}),
-    'RB': (ADV, {}),
-    'RBR': (ADV, {'misc': COMPARATIVE}),
-    'RBS': (ADV, {'misc': SUPERLATIVE}),
-    'RP': (PRT, {}),
-    'SYM': (X, {}),
-    'TO': (PRT, {}),
-    'UH': (X, {}),
-    'VB': (VERB, {}),
-    'VBD': (VERB, {'tenspect': PAST}),
-    'VBG': (VERB, {'tenspect': ING}),
-    'VBN': (VERB, {'tenspect': PASSIVE}),
-    'VBP': (VERB, {'tenspect': PRESENT}),
-    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
-    'WDT': (DET, {'misc': RELATIVE}),
-    'WP': (PRON, {'misc': RELATIVE}),
-    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
-    'WRB': (ADV, {'misc': RELATIVE}),
-    '!': (PUNCT, {}),
-    '#': (PUNCT, {}),
-    '$': (PUNCT, {}),
-    "''": (PUNCT, {}),
-    "(": (PUNCT, {}),
-    ")": (PUNCT, {}),
-    "-LRB-": (PUNCT, {}),
-    "-RRB-": (PUNCT, {}),
-    ".": (PUNCT, {}),
-    ",": (PUNCT, {}),
-    "``": (PUNCT, {}),
-    ":": (PUNCT, {}),
-    "?": (PUNCT, {}),
-}
-
-
-# Feature templates for the POS tagger. Each inner tuple lists context-array
-# indices (the P2_/P1_/W_/N1_/N2_ enum members) that are combined into one
-# feature.
-# NOTE(review): (N1_pos_type,) appears twice in a row near the end; one of
-# the two was possibly meant to be (P1_pos_type,) -- confirm before changing.
-POS_TEMPLATES = (
-    (W_sic,),
-    (P1_lemma, P1_pos),
-    (P2_lemma, P2_pos),
-    (N1_sic,),
-    (N2_sic,),
-
-    (W_suffix,),
-    (W_prefix,),
-
-    (P1_pos,),
-    (P2_pos,),
-    (P1_pos, P2_pos),
-    (P1_pos, W_sic),
-    (P1_suffix,),
-    (N1_suffix,),
-
-    (W_shape,),
-    (W_cluster,),
-    (N1_cluster,),
-    (N2_cluster,),
-    (P1_cluster,),
-    (P2_cluster,),
-
-    (W_pos_type,),
-    (N1_pos_type,),
-    (N1_pos_type,),
-    (P1_pos, W_pos_type, N1_pos_type),
-)
-
-
-cdef class English(Language):
-    """English tokenizer, tightly coupled to lexicon.
-
-    Attributes:
-        name (unicode): The two letter code used by Wikipedia for the language.
-        lexicon (Lexicon): The lexicon. Exposes the lookup method.
-    """
-    def get_props(self, unicode string):
-        """Return a dict holding the flag bits ('flags') and the word shape
-        ('dense') computed for ``string``."""
-        return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)}
-
-    def set_flags(self, unicode string):
-        """Pack the orth predicates for ``string`` into one bit field.
-
-        Each predicate result is shifted to the bit position named by the
-        matching FlagID member. Only IS_ALPHA..IS_UPPER, LIKE_URL and
-        LIKE_NUMBER are set here.
-        """
-        cdef flags_t flags = 0
-        flags |= orth.is_alpha(string) << IS_ALPHA
-        flags |= orth.is_ascii(string) << IS_ASCII
-        flags |= orth.is_digit(string) << IS_DIGIT
-        flags |= orth.is_lower(string) << IS_LOWER
-        flags |= orth.is_punct(string) << IS_PUNCT
-        flags |= orth.is_space(string) << IS_SPACE
-        flags |= orth.is_title(string) << IS_TITLE
-        flags |= orth.is_upper(string) << IS_UPPER
-
-        flags |= orth.like_url(string) << LIKE_URL
-        flags |= orth.like_number(string) << LIKE_NUMBER
-        return flags
-
-    def set_pos(self, Tokens tokens):
-        """Predict a POS tag for every token in place, then set its morphology.
-
-        Requires ``self.morphologizer`` to be set (checked by an assert).
-        """
-        cdef int i
-        cdef atom_t[N_CONTEXT_FIELDS] context
-        cdef TokenC* t = tokens.data
-        assert self.morphologizer is not None
-        for i in range(tokens.length):
-            # The context must be rebuilt per token: it includes the tags
-            # already predicted for the preceding tokens.
-            fill_pos_context(context, i, t)
-            t[i].pos = self.pos_tagger.predict(context)
-            self.morphologizer.set_morph(i, t)
-
-    def train_pos(self, Tokens tokens, golds):
-        """Run the tagger over ``tokens`` with the gold tag supplied per token.
-
-        ``golds[i]`` is passed to ``pos_tagger.predict`` as a one-element
-        list. Returns the number of tokens whose predicted tag equals the
-        gold tag.
-        """
-        cdef int i
-        cdef atom_t[N_CONTEXT_FIELDS] context
-        c = 0
-        cdef TokenC* t = tokens.data
-        for i in range(tokens.length):
-            fill_pos_context(context, i, t)
-            t[i].pos = self.pos_tagger.predict(context, [golds[i]])
-            self.morphologizer.set_morph(i, t)
-            c += t[i].pos == golds[i]
-        return c
-
-    cdef int is_base_np_end(self, const TokenC* token) except -1:
-        # Placeholder: not implemented.
-        pass
-
-    cdef int is_outside_base_np(self, const TokenC* token) except -1:
-        # Placeholder: not implemented.
-        pass
-
-
-
-cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1:
-    # Fill the five context slots from the tokens at offsets -2..+2 around i.
-    # Each call writes eight values starting at the slot's *_sic index.
-    # NOTE(review): tokens[i-2] and tokens[i+2] are read without a bounds
-    # check, so this assumes the token array is padded by at least two
-    # entries on each side -- TODO confirm against Tokens.
-    _fill_from_token(&context[P2_sic], &tokens[i-2])
-    _fill_from_token(&context[P1_sic], &tokens[i-1])
-    _fill_from_token(&context[W_sic], &tokens[i])
-    _fill_from_token(&context[N1_sic], &tokens[i+1])
-    _fill_from_token(&context[N2_sic], &tokens[i+2])
-
-
-cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
-    # Write one token's eight features into context[0..7]. The order must
-    # match the per-slot field order of the context enum in en.pxd
-    # (sic, cluster, shape, prefix, suffix, pos, lemma, pos_type).
-    context[0] = t.lex.sic
-    context[1] = t.lex.cluster
-    context[2] = t.lex.shape
-    context[3] = t.lex.prefix
-    context[4] = t.lex.suffix
-    context[5] = t.pos
-    context[6] = t.lemma
-    context[7] = t.lex.pos_type
-
-
-EN = English('en')
--- a/spacy/index.pxd
+++ b/spacy/index.pxd
@ -1,44 +0,0 @@
-from libcpp.vector cimport vector
-from libcpp.pair cimport pair
-
-from preshed.counter cimport count_t
-from preshed.maps cimport PreshMap
-from preshed.counter cimport PreshCounter
-from cymem.cymem cimport Pool
-
-from .lang cimport Lexicon
-from .tokens cimport Tokens, TokenC
-from .typedefs cimport id_t
-from .lexeme cimport attr_id_t
-from .typedefs cimport attr_t
-from .typedefs cimport hash_t
-
-from murmurhash.mrmr cimport hash64
-
-
-ctypedef vector[pair[id_t, count_t]] count_vector_t
-
-
-cdef class Index:
-    # attr_id: which lexeme attribute is counted (passed to get_attr).
-    cdef attr_id_t attr_id
-    # max_value: largest attribute value seen so far by count().
-    cdef readonly attr_t max_value
-    # counts: one vector of (term, count) pairs per call to count().
-    cdef vector[count_vector_t] counts
-    
-    cpdef int count(self, Tokens tokens) except -1
-
-
-cdef class DecisionMemory:
-    cdef int n_classes
-    cdef Pool mem
-    # _counts is keyed by context key; _class_counts by a hash of
-    # (context key, class) -- see inc() in index.pyx.
-    cdef PreshCounter _counts
-    cdef PreshCounter _class_counts
-    cdef PreshMap memos
-    cdef list class_names
-    
-    cdef int inc(self, hash_t context_key, hash_t clas, count_t inc) except -1
-    cdef int find_best_class(self, count_t* counts, hash_t context_key) except -1
-
-    cdef inline int get(self, hash_t context_key) nogil:
-        # load() stores each class as clas + 1, so subtracting 1 recovers it.
-        # Presumably a missing key yields a NULL/0 value and therefore -1
-        # here -- confirm against PreshMap.get.
-        return <int><size_t>self.memos.get(context_key) - 1
-
-
--- a/spacy/index.pyx
+++ b/spacy/index.pyx
@ -1,120 +0,0 @@
-"""Create a term-document matrix"""
-cimport cython
-from libc.stdint cimport int64_t
-from libc.string cimport memmove
-
-from cymem.cymem cimport Address
-
-from .lexeme cimport Lexeme, get_attr
-from .tokens cimport TokenC
-from .typedefs cimport hash_t
-
-from preshed.maps cimport MapStruct, Cell, map_get, map_set, map_init
-from murmurhash.mrmr cimport hash64
-
-
-cdef class Index:
-    """Accumulates per-document counts of one lexeme attribute."""
-    def __init__(self, attr_id_t attr_id):
-        self.attr_id = attr_id
-        self.max_value = 0
-
-    cpdef int count(self, Tokens tokens) except -1:
-        """Count the attribute values of ``tokens`` and append the result.
-
-        The (term, count) pairs for this document are pushed onto
-        ``self.counts``; ``self.max_value`` is raised to the largest term
-        seen. No value is returned explicitly.
-        """
-        cdef PreshCounter counts = PreshCounter(2 ** 8)
-        cdef attr_id_t attr_id = self.attr_id
-        cdef attr_t term
-        cdef int i
-        for i in range(tokens.length):
-            term = get_attr(tokens.data[i].lex, attr_id)
-            counts.inc(term, 1)
-            if term > self.max_value:
-                self.max_value = term
-        cdef count_t count
-        cdef count_vector_t doc_counts
-        for term, count in counts:
-            doc_counts.push_back(pair[id_t, count_t](term, count))
-        self.counts.push_back(doc_counts)
-
-
-cdef class DecisionMemory:
-    """Counts how often each class occurs with each context key, and holds
-    memoised context -> class decisions loaded from disk.
-    """
-    def __init__(self, class_names):
-        self.class_names = class_names
-        self.n_classes = len(class_names)
-        self.mem = Pool()
-        self._counts = PreshCounter()
-        self._class_counts = PreshCounter()
-        self.memos = PreshMap()
-
-    def load(self, loc, thresh=50):
-        """Load memoised decisions from the text file at ``loc``.
-
-        Each line holds three whitespace-separated integers: frequency,
-        context key and class. A line is kept when ``thresh`` is 0 or its
-        frequency is at least ``thresh``.
-        """
-        cdef:
-            count_t freq
-            hash_t key
-            int clas
-        # The context manager closes the file even if a line fails to parse.
-        with open(loc) as file_:
-            for line in file_:
-                freq, key, clas = [int(p) for p in line.split()]
-                if thresh == 0 or freq >= thresh:
-                    # Stored as clas + 1; the inline get() in index.pxd
-                    # subtracts 1 again.
-                    self.memos.set(key, <void*>(clas+1))
-
-    def __getitem__(self, ids):
-        """Return a dict mapping each class name to its count for the
-        two-id context ``ids``."""
-        cdef id_t[2] context
-        # Read the context from the argument (the previous code assigned the
-        # uninitialised array to itself and ignored ``ids``).
-        context[0] = ids[0]
-        context[1] = ids[1]
-        cdef hash_t context_key = hash64(context, 2 * sizeof(id_t), 0)
-        cdef hash_t[2] class_context
-        class_context[0] = context_key
-        counts = {}
-        cdef id_t i
-        for i, clas in enumerate(self.class_names):
-            class_context[1] = <hash_t>i
-            key = hash64(class_context, sizeof(hash_t) * 2, 0)
-            count = self._class_counts[key]
-            counts[clas] = count
-        return counts
-
-    @cython.cdivision(True)
-    def iter_contexts(self, float min_acc=0.99, count_t min_freq=10):
-        """Yield (best-class count, context key, best class) for every context
-        seen at least ``min_freq`` times whose best class accounts for at
-        least ``min_acc`` of its occurrences.
-
-        The yielded triple has the same field order as the lines read by
-        ``load``.
-        """
-        cdef Address counts_addr = Address(self.n_classes, sizeof(count_t))
-        cdef count_t* counts = <count_t*>counts_addr.ptr
-        cdef MapStruct* context_counts = self._counts.c_map
-        cdef hash_t context_key
-        cdef count_t context_freq
-        cdef int best_class
-        cdef float acc
-
-        cdef int i
-        for i in range(context_counts.length):
-            context_key = context_counts.cells[i].key
-            context_freq = <count_t>context_counts.cells[i].value
-            if context_key != 0 and context_freq >= min_freq:
-                best_class = self.find_best_class(counts, context_key)
-                # Cast before dividing: both operands are integer counts, so
-                # an integer division would truncate the ratio to 0 or 1.
-                acc = <float>counts[best_class] / <float>context_freq
-                if acc >= min_acc:
-                    yield counts[best_class], context_key, best_class
-
-    cdef int inc(self, hash_t context_key, hash_t clas, count_t inc) except -1:
-        """Add ``inc`` to the count of ``context_key`` and to the count of the
-        (context_key, clas) pair."""
-        cdef hash_t context_and_class_key
-        cdef hash_t[2] context_and_class
-        context_and_class[0] = context_key
-        context_and_class[1] = clas
-        context_and_class_key = hash64(context_and_class, 2 * sizeof(hash_t), 0)
-        self._counts.inc(context_key, inc)
-        self._class_counts.inc(context_and_class_key, inc)
-
-    cdef int find_best_class(self, count_t* counts, hash_t context_key) except -1:
-        """Fill ``counts[clas]`` for every class and return the class with the
-        highest count for ``context_key``.
-
-        Ties go to the highest class index (the comparison is ``>=``).
-        Returns 0 when there are no classes.
-        """
-        cdef hash_t[2] unhashed_key
-        unhashed_key[0] = context_key
-
-        cdef hash_t key
-        cdef int clas
-        # Initialised so the return value is defined even with zero classes.
-        cdef int best = 0
-        cdef int mode = 0
-        for clas in range(self.n_classes):
-            unhashed_key[1] = <hash_t>clas
-            key = hash64(unhashed_key, sizeof(hash_t) * 2, 0)
-            count = self._class_counts[key]
-            counts[clas] = count
-            if count >= mode:
-                mode = count
-                best = clas
-        return best
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@ -1,90 +0,0 @@
-from os import path
-
-
-# Suffix-rewrite rules as (old_suffix, new_suffix) pairs. lemmatize() tries
-# every rule whose old_suffix matches the end of the word and keeps each
-# rewritten form that is present in the index, so overlapping rules (e.g.
-# 's' and 'ses') can all contribute candidates.
-NOUN_RULES = (
-    ('s', ''),
-    ('ses', 's'),
-    ('ves', 'f'),
-    ('xes', 'x'),
-    ('zes', 'z'),
-    ('ches', 'ch'),
-    ('shes', 'sh'),
-    ('men', 'man'),
-    ('ies', 'y')
-)
-
-
-# Same (old_suffix, new_suffix) format, used by Lemmatizer.verb.
-VERB_RULES = (
-    ("s", ""),
-    ("ies", "y"),
-    ("es", "e"),
-    ("es", ""),
-    ("ed", "e"),
-    ("ed", ""),
-    ("ing", "e"),
-    ("ing", "")
-)
-
-
-# Same (old_suffix, new_suffix) format, used by Lemmatizer.adj.
-ADJ_RULES = (
-    ("er", ""),
-    ("est", ""),
-    ("er", "e"),
-    ("est", "e")
-)
-
-
-class Lemmatizer(object):
-    """Rule-based lemmatizer backed by per-POS index and exception files.
-
-    For each of 'adj', 'adv', 'noun' and 'verb' the constructor reads
-    ``index.<pos>`` and ``<pos>.exc`` from ``wn_dict_dir`` (presumably a
-    WordNet dict directory -- confirm with the caller).
-    """
-    def __init__(self, wn_dict_dir):
-        self.index = {}
-        self.exc = {}
-        for pos in ('adj', 'adv', 'noun', 'verb'):
-            index_loc = path.join(wn_dict_dir, 'index.%s' % pos)
-            exc_loc = path.join(wn_dict_dir, '%s.exc' % pos)
-            self.index[pos] = read_index(index_loc)
-            self.exc[pos] = read_exc(exc_loc)
-
-    def noun(self, string):
-        """Return the set of candidate noun lemmas for ``string``."""
-        index, exceptions = self.index['noun'], self.exc['noun']
-        return lemmatize(string, index, exceptions, NOUN_RULES)
-
-    def verb(self, string):
-        """Return the set of candidate verb lemmas for ``string``."""
-        index, exceptions = self.index['verb'], self.exc['verb']
-        return lemmatize(string, index, exceptions, VERB_RULES)
-
-    def adj(self, string):
-        """Return the set of candidate adjective lemmas for ``string``."""
-        index, exceptions = self.index['adj'], self.exc['adj']
-        return lemmatize(string, index, exceptions, ADJ_RULES)
-
-
-def lemmatize(string, index, exceptions, rules):
-    """Return the set of candidate lemmas for ``string``.
-
-    The lower-cased word is a candidate when it is in ``index``; entries
-    from ``exceptions`` are added; then every (old, new) suffix rule that
-    matches contributes its rewritten form if that form is in ``index``.
-    When nothing is found, the lower-cased word itself is the only result.
-    """
-    lowered = string.lower()
-    candidates = [lowered] if lowered in index else []
-    candidates.extend(exceptions.get(lowered, []))
-    for suffix, replacement in rules:
-        if not lowered.endswith(suffix):
-            continue
-        rewritten = lowered[:len(lowered) - len(suffix)] + replacement
-        if rewritten in index:
-            candidates.append(rewritten)
-    if candidates:
-        return set(candidates)
-    return set([lowered])
-
-
-def read_index(loc):
-    """Read the first field of each line of the index file at ``loc``.
-
-    Lines starting with a space and blank lines are skipped, as are words
-    containing an underscore. Returns the remaining words as a set.
-    """
-    index = set()
-    # The context manager closes the file even if reading fails part-way.
-    with open(loc) as file_:
-        for line in file_:
-            if line.startswith(' '):
-                continue
-            pieces = line.split()
-            if not pieces:
-                # Blank line: nothing to index.
-                continue
-            word = pieces[0]
-            if '_' not in word:
-                index.add(word)
-    return index
-
-
-def read_exc(loc):
-    """Read the exceptions file at ``loc`` into a dict.
-
-    Each non-blank line that does not start with a space maps its first
-    field to a tuple of the remaining fields.
-    """
-    exceptions = {}
-    # The context manager closes the file even if reading fails part-way.
-    with open(loc) as file_:
-        for line in file_:
-            if line.startswith(' '):
-                continue
-            pieces = line.split()
-            if not pieces:
-                # Blank line: no key to record.
-                continue
-            exceptions[pieces[0]] = tuple(pieces[1:])
-    return exceptions
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -36,11 +36,11 @@ cdef struct _Cached:
 cdef class Morphologizer:
    """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
    """
-    def __init__(self, StringStore strings, object lemmatizer, **kwargs):
+    def __init__(self, StringStore strings, object lemmatizer,
+                 irregulars=None, tag_map=None, tag_names=None):
        self.mem = Pool()
        self.strings = strings
-        tag_map = kwargs['tag_map']
-        self.tag_names = kwargs['tag_names']
+        self.tag_names = tag_names
        self.lemmatizer = lemmatizer
        self._cache = PreshMapArray(len(self.tag_names))
        self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
@ -55,9 +55,16 @@ cdef class Morphologizer:
            self.tags[i].morph.person = props.get('person', 0)
            self.tags[i].morph.case = props.get('case', 0)
            self.tags[i].morph.misc = props.get('misc', 0)
-        #if path.exists(path.join(data_dir, 'morphs.json')):
-        #    with open(path.join(data_dir, 'morphs.json')) as file_:
-        #        self.load_exceptions(json.load(file_))
+        if irregulars is not None:
+            self.load_exceptions(irregulars)
+
+    @classmethod
+    def from_dir(cls, StringStore strings, object lemmatizer, data_dir):
+        tag_map = None
+        irregulars = None
+        tag_names = None
+        return cls(strings, lemmatizer, tag_map=tag_map, irregulars=irregulars,
+                   tag_names=tag_names)

    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
        if self.lemmatizer is None:
@ -86,7 +93,6 @@ cdef class Morphologizer:
            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
            cached.morph = tag.morph
            self._cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
-
        tokens[i].lemma = cached.lemma
        tokens[i].morph = cached.morph

--- a/spacy/ner/init.pxd
+++ b/spacy/ner/init.pxd
--- a/spacy/ner/init.py
+++ b/spacy/ner/init.py
--- a/spacy/ner/_feats.pxd
+++ b/spacy/ner/_feats.pxd
--- a/spacy/ner/_feats.pyx
+++ b/spacy/ner/_feats.pyx
@ -1,169 +0,0 @@
-from spacy.context cimport FIELD_IDS, Token
-
-
-# Module-level aliases for the nine token slots of FIELD_IDS, used by the
-# template tuples below. Presumably P4..P1 are the four preceding tokens,
-# N0 the current token and N1..N4 the following ones -- confirm against
-# spacy.context.
-cdef Token P4 = FIELD_IDS.P4
-cdef Token P3 = FIELD_IDS.P3
-cdef Token P2 = FIELD_IDS.P2
-cdef Token P1 = FIELD_IDS.P1
-cdef Token N0 = FIELD_IDS.N0
-cdef Token N1 = FIELD_IDS.N1
-cdef Token N2 = FIELD_IDS.N2
-cdef Token N3 = FIELD_IDS.N3
-cdef Token N4 = FIELD_IDS.N4
-
-"""
-TEMPLATES = (
-    (N0.sic,),
-    (N0.cluster,),
-
-    (P1.pos,),
-    (P1.sic,),
-
-    (N1.norm,),
-    (N1.pos,),
-
-    (P1.ner,),
-    (P2.ner,),
-
-    (N0.cluster,),
-    (P1.cluster,),
-    (N1.cluster,),
-
-    (N0.is_alpha,),
-    (N0.is_digit,),
-    (N0.is_title,),
-    (N0.is_upper,),
-
-    (N0.is_title, N0.oft_title),
-    (N0.is_upper, N0.oft_upper),
-
-    (P1.cluster, N0.norm),
-    (N0.norm, N1.cluster),
-
-    (P1.ner, N0.pos),
-    (P2.ner, P1.ner, N0.pos),
-
-    (P2.pos, P1.pos, N0.sic),
-    (N0.sic, N1.pos, N2.pos)
-)
-"""
-
-# Templates over word form, prefix/suffix, shape and norm of the tokens
-# around N0. Each inner tuple is one feature built from the listed fields.
-# NOTE(review): (P1.shape, N0.shape,) and (N0.shape, P1.shape,) list the
-# same two fields in a different order -- possibly one was meant to be
-# (N0.shape, N1.shape); confirm before changing.
-LOCAL = (
-    (N0.sic,),
-    (P1.sic,),
-    (N1.sic,),
-    (P2.sic,),
-    (N2.sic,),
-    (P3.sic,),
-    (N3.sic,),
-    (P4.sic,),
-    (N4.sic,),
-    
-    (P1.sic, N0.sic,),
-    (N0.sic, N1.sic),
-    
-    (N0.prefix,),
-    (N0.suffix,),
-
-    (P1.shape,),
-    (N0.shape,),
-    (N1.shape,),
-    (P1.shape, N0.shape,),
-    (N0.shape, P1.shape,),
-    (P1.shape, N0.shape, N1.shape),
-    (N2.shape,),
-    (P2.shape,),
-    (P3.shape,),
-    (N3.shape,),
-    (P4.shape,),
-    (N4.shape,),
-
-    (P2.norm, P1.norm, N0.norm),
-    (P1.norm, N0.norm, N1.norm),
-    (N0.norm, N1.norm, N2.norm)
-)
-
-BOOLS = (
-    (N0.is_title,),
-)
-
-
-HISTORY = (
-    (P1.ner,),
-    (P1.ner, N0.sic,),
-    (P2.ner,),
-    (P2.ner, P1.ner),
-    (P2.ner, P1.ner, N0.sic),
-    (P2.pos, P1.ner, N0.pos),
-    (P2.ner, P1.pos, N0.pos),
-    (P3.ner,),
-    (P4.ner,),
-)
-
-POS = (
-    (P4.pos,),
-    (P3.pos,),
-    (P2.pos,),
-    (P1.pos,),
-    (N0.pos,),
-    (N1.pos,),
-    (N2.pos,),
-    (N3.pos,),
-    (N4.pos,),
-
-    (P1.pos, N0.pos),
-    (N0.pos, N1.pos),
-    (P2.pos, P1.pos, N0.pos),
-    (P1.pos, N0.pos, N1.pos),
-    (N0.pos, N1.pos, N2.pos)
-)
-
-CLUSTERS = (
-    (P4.cluster,),
-    (P3.cluster,),
-    (P2.cluster,),
-    (P1.cluster,),
-    (N0.cluster,),
-    (N1.cluster,),
-    (N2.cluster,),
-    (N3.cluster,),
-    (N4.cluster,),
-
-    (P1.cluster, N0.cluster),
-    (N0.cluster, N1.cluster),
-)
-
-
-# Templates pairing a cluster field with a POS field of adjacent tokens.
-# NOTE(review): the first two entries list the same fields (P1.cluster,
-# N0.pos) in swapped order -- possibly (P1.pos, N0.cluster) was intended
-# for one of them; confirm before changing.
-CLUSTER_POS = (
-    (P1.cluster, N0.pos),
-    (N0.pos, P1.cluster),
-    (N0.cluster, N1.pos),
-    (N0.pos, N1.cluster)
-)
-
-
-GAZ = (
-    (N0.in_males,),
-    (N0.in_females,),
-    (N0.in_surnames,),
-    (N0.in_places,),
-    (N0.in_games,),
-    (N0.in_celebs,),
-    (N0.in_names,),
-    (P1.in_males,),
-    (P1.in_females,),
-    (P1.in_surnames,),
-    (P1.in_places,),
-    (P1.in_games,),
-    (P1.in_celebs,),
-    (P1.in_names,),
-    (N1.in_males,),
-    (N1.in_females,),
-    (N1.in_surnames,),
-    (N1.in_places,),
-    (N1.in_games,),
-    (N1.in_celebs,),
-    (N1.in_names,),
-)
-
-# The full template set is the concatenation of the groups defined above.
-# The triple-quoted TEMPLATES tuple near the top of this file is a bare
-# string literal, so it defines nothing.
-TEMPLATES = LOCAL + HISTORY + CLUSTERS + POS + CLUSTER_POS + GAZ + BOOLS
--- a/spacy/ner/_state.pxd
+++ b/spacy/ner/_state.pxd
@ -1,15 +0,0 @@
-from cymem.cymem cimport Pool
-from .structs cimport State, Entity, Move
-
-# Start a new entity with the given label at the current position s.i.
-cdef int begin_entity(State* s, label) except -1
-
-# Set the end of the current entity to s.i + 1.
-cdef int end_entity(State* s) except -1
-
-# Allocate a State (and its ents/tags arrays) from the pool.
-cdef State* init_state(Pool mem, int sent_length) except NULL
-# Copy source into dest, growing dest's arrays when source is longer.
-cdef int copy_state(Pool mem, State* dest, State* source) except -1
-
-cdef bint entity_is_open(State *s) except -1
-
-cdef int entity_is_sunk(State *s, Move* golds) except -1
-
-# True once s.i has reached s.length and no entity is open.
-cdef int is_done(State* s) except -1
--- a/spacy/ner/_state.pyx
+++ b/spacy/ner/_state.pyx
@ -1,54 +0,0 @@
-from libc.string cimport memcpy
-
-
-cdef int begin_entity(State* s, label) except -1:
-    '''Open a new entity at the current position.
-
-    Advances the entity index s.j first, so slot 0 of s.ents is never
-    written by this function. The new entity spans [s.i, s.i + 1).
-    '''
-    s.j += 1
-    s.ents[s.j].start = s.i
-    s.ents[s.j].tag = label
-    s.ents[s.j].end = s.i + 1
-
-
-cdef int end_entity(State* s) except -1:
-    '''Set the (exclusive) end of the current entity to s.i + 1.'''
-    s.ents[s.j].end = s.i + 1
-
-
-cdef State* init_state(Pool mem, int sent_length) except NULL:
-    '''Allocate a State for a sentence of ``sent_length`` tokens.
-
-    The State and its ``ents`` and ``tags`` arrays (one slot per token) are
-    allocated from ``mem``. Returns the new State.
-    '''
-    cdef State* s = <State*>mem.alloc(1, sizeof(State))
-    s.ents = <Entity*>mem.alloc(sent_length, sizeof(Entity))
-    s.tags = <int*>mem.alloc(sent_length, sizeof(int))
-    s.length = sent_length
-    # Without an explicit return the function yields NULL, which is its
-    # declared error value.
-    return s
-
-
-cdef bint entity_is_open(State *s) except -1:
-    '''Report whether the current entity slot has a non-zero start.
-
-    NOTE(review): an entity that begins at token 0 has start == 0 and is
-    therefore reported as not open, and end_entity never resets start, so a
-    finished entity still reads as open -- confirm the intended encoding.
-    '''
-    return s.ents[s.j].start != 0
-
-
-cdef int entity_is_sunk(State *s, Move* golds) except -1:
-    '''Return False when no entity is open; otherwise raise StandardError.
-
-    The comparison against the gold moves is not implemented yet: the
-    intended logic is kept below as commented-out code.
-    '''
-    if not entity_is_open(s):
-        return False
-    raise StandardError
-    #cdef Entity* ent = &s.ents[s.j]
-    #cdef Move* gold = &golds[ent.start]
-    #if gold.action != BEGIN and gold.action != UNIT:
-    #    return True
-    #elif gold.label != ent.label:
-    #    return True
-    #else:
-    #    return False
-
-
-cdef int copy_state(Pool mem, State* dest, State* source) except -1:
-    '''Copy state source into state dest.
-
-    dest's ents and tags arrays are reallocated from ``mem`` when source is
-    longer than dest; then the arrays and the scalar fields length, i, j
-    and curr are copied.
-    '''
-    if source.length > dest.length:
-        dest.ents = <Entity*>mem.realloc(dest.ents, source.length * sizeof(Entity))
-        dest.tags = <int*>mem.realloc(dest.tags, source.length * sizeof(int))
-    memcpy(dest.ents, source.ents, source.length * sizeof(Entity))
-    memcpy(dest.tags, source.tags, source.length * sizeof(int))
-    dest.length = source.length
-    dest.i = source.i
-    dest.j = source.j
-    dest.curr = source.curr
-
-
-cdef int is_done(State* s) except -1:
-    '''True when every token has been consumed (s.i >= s.length) and no
-    entity is open.'''
-    return s.i >= s.length and not entity_is_open(s)
--- a/spacy/ner/annot.pxd
+++ b/spacy/ner/annot.pxd
@ -1,8 +0,0 @@
-from cymem.cymem cimport Pool
-
-cdef class NERAnnotation:
-    cdef Pool mem
-    # Per-token arrays filled by __init__ in annot.pyx: for a token inside
-    # an entity they hold that entity's start, end and label; every other
-    # slot is -1.
-    cdef int* starts
-    cdef int* ends
-    cdef int* labels
-    # The (start, end, label) tuples passed to the constructor.
-    cdef readonly list entities
--- a/spacy/ner/annot.pyx
+++ b/spacy/ner/annot.pyx
@ -1,94 +0,0 @@
-from libc.string cimport memset
-
-
-cdef class NERAnnotation:
-    """Gold-standard entity annotation for one sentence.
-
-    ``entities`` is a list of (start, end, label) tuples with ``end``
-    exclusive; ``length`` is the number of tokens. For every token inside
-    an entity the ``starts``, ``ends`` and ``labels`` arrays hold that
-    entity's values; all other slots are -1. ``entity_types`` is accepted
-    but not used by the constructor.
-    """
-    def __init__(self, entities, length, entity_types):
-        self.mem = Pool()
-        self.starts = <int*>self.mem.alloc(length, sizeof(int))
-        self.ends = <int*>self.mem.alloc(length, sizeof(int))
-        self.labels = <int*>self.mem.alloc(length, sizeof(int))
-        self.entities = entities
-        # memset with -1 sets every byte to 0xFF, i.e. every int to -1.
-        memset(self.starts, -1, sizeof(int) * length)
-        memset(self.ends, -1, sizeof(int) * length)
-        memset(self.labels, -1, sizeof(int) * length)
-
-        cdef int start, end, label
-        for start, end, label in entities:
-            for i in range(start, end):
-                self.starts[i] = start
-                self.ends[i] = end
-                self.labels[i] = label
-
-    @classmethod
-    def from_bilous(cls, tag_strs, entity_types):
-        """Build an annotation from BILOU tag strings such as 'B-PER'.
-
-        'O' and '-' tags are skipped. A label not yet in ``entity_types``
-        is appended to that list (the list is mutated) and gets the next
-        free index.
-        """
-        entities = []
-        start = None
-        for i, tag_str in enumerate(tag_strs):
-            if tag_str == 'O' or tag_str == '-':
-                continue
-            # Split on the first hyphen only, so labels may contain '-'.
-            move, label_str = tag_str.split('-', 1)
-            # list.index raises ValueError for a missing item instead of
-            # returning -1, so membership is tested explicitly.
-            if label_str in entity_types:
-                label = entity_types.index(label_str)
-            else:
-                label = len(entity_types)
-                entity_types.append(label_str)
-            if move == 'U':
-                assert start is None
-                entities.append((i, i+1, label))
-            elif move == 'B':
-                assert start is None
-                start = i
-            elif move == 'L':
-                assert start is not None
-                entities.append((start, i+1, label))
-                start = None
-        return cls(entities, len(tag_strs), entity_types)
-
-
-
-def read_iob(file_, entity_types, create_tokens):
-    """Read sentences from a four-column (word, pos, chunk, ner) file.
-
-    Sentences are separated by blank lines; blocks starting with
-    '-DOCSTART-' are skipped. Returns a list of (tokens, NERAnnotation)
-    pairs, where tokens is ``create_tokens(words)``.
-    """
-    sents = []
-    for sent_str in file_.read().strip().split('\n\n'):
-        if sent_str.startswith('-DOCSTART-'):
-            continue
-        columns = [token_str.split() for token_str in sent_str.split('\n')]
-        words = [word for word, _pos, _chunk, _ner in columns]
-        iob = [ner for _word, _pos, _chunk, ner in columns]
-        bilou = iob_to_bilou(iob)
-        tokens = create_tokens(words)
-        annot = NERAnnotation.from_bilous(bilou, entity_types)
-        sents.append((tokens, annot))
-    return sents
-
-
-def iob_to_bilou(tags):
-    """Convert a sequence of IOB tags into a list of BILOU tags.
-
-    The input is copied, so the caller's sequence is not modified.
-    """
-    out = []
-    tags = list(tags)
-    # Alternate between draining a run of 'O' tags and one entity until the
-    # working copy is empty; each helper pops what it consumes.
-    while tags:
-        out.extend(_consume_os(tags))
-        out.extend(_consume_ent(tags))
-    return out
-
-def _consume_os(tags):
-    """Pop and yield the leading run of 'O' tags from ``tags`` (in place)."""
-    while True:
-        if not tags or tags[0] != 'O':
-            return
-        yield tags.pop(0)
-
-def _consume_ent(tags):
-    """Pop one entity from the front of ``tags`` and return its BILOU tags.
-
-    The first tag opens the entity; following tags equal to 'I-<label>'
-    continue it. A one-token entity becomes ['U-<label>'], a longer one
-    ['B-<label>', 'I-<label>', ..., 'L-<label>']. Returns [] when ``tags``
-    is empty.
-    """
-    if not tags:
-        return []
-    first = tags.pop(0)
-    # Rewrite only the leading action letter. str.replace('B', 'I') would
-    # also rewrite every 'B' inside the label (e.g. 'B-BRAND').
-    target = 'I' + first[1:] if first.startswith('B') else first
-    length = 1
-    while tags and tags[0] == target:
-        length += 1
-        tags.pop(0)
-    label = target[2:]
-    if length == 1:
-        return ['U-' + label]
-    else:
-        start = 'B-' + label
-        end = 'L-' + label
-        middle = ['I-%s' % label for _ in range(1, length - 1)]
-        return [start] + middle + [end]
--- a/spacy/ner/bilou_moves.pxd
+++ b/spacy/ner/bilou_moves.pxd
@ -1,27 +0,0 @@
-from cymem.cymem cimport Pool
-
-from thinc.typedefs cimport class_t
-from thinc.typedefs cimport weight_t
-
-from .structs cimport State, Move
-
-
-# Transition actions. bilou_moves.pyx maps them to the one-letter names
-# '?', 'B', 'I', 'L', 'U' and 'O' via ACTION_NAMES; N_ACTIONS is the number
-# of actions and sizes that list.
-cpdef enum ActionType:
-    MISSING
-    BEGIN
-    IN
-    LAST
-    UNIT
-    OUT
-    N_ACTIONS
-
-
-# Both set_accept_* functions return the number of accepted moves and use 0
-# as their error value, so at least one move must always be accepted.
-cdef int set_accept_if_oracle(Move* moves, Move* golds, int n, State* s) except 0
-
-cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0
-
-# Highest-scoring accepted move; never returns NULL on success.
-cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL
-
-# Apply a move to the state and advance to the next token.
-cdef int transition(State *s, Move* m) except -1
-
-# Fill the pre-allocated moves array from a list of tag names.
-cdef int fill_moves(Move* moves, list tag_names) except -1
--- a/spacy/ner/bilou_moves.pyx
+++ b/spacy/ner/bilou_moves.pyx
@ -1,207 +0,0 @@
-from __future__ import unicode_literals
-
-from ._state cimport begin_entity
-from ._state cimport end_entity
-from ._state cimport entity_is_open
-from ._state cimport entity_is_sunk
-
-
-# One-letter name for each ActionType, indexed by the enum value.
-# fill_moves looks an action up with ACTION_NAMES.index(action_str).
-ACTION_NAMES = ['' for _ in range(N_ACTIONS)]
-ACTION_NAMES[<int>MISSING] = '?'
-ACTION_NAMES[<int>BEGIN] = 'B'
-ACTION_NAMES[<int>IN] = 'I'
-ACTION_NAMES[<int>LAST] = 'L'
-ACTION_NAMES[<int>UNIT] = 'U'
-ACTION_NAMES[<int>OUT] = 'O'
-
-
-cdef bint can_begin(State* s, int label):
-    # BEGIN is valid only when no entity is open; ``label`` is not consulted.
-    return not entity_is_open(s)
-
-
-cdef bint can_in(State* s, int label):
-    # IN is valid only inside an open entity whose label matches ``label``.
-    return entity_is_open(s) and s.curr.label == label
-
-
-cdef bint can_last(State* s, int label):
-    # LAST is valid only inside an open entity whose label matches ``label``
-    # (same condition as can_in).
-    return entity_is_open(s) and s.curr.label == label
-
-
-cdef bint can_unit(State* s, int label):
-    # UNIT is valid only when no entity is open; ``label`` is not consulted.
-    return not entity_is_open(s)
-
-
-cdef bint can_out(State* s, int label):
-    # OUT is valid only when no entity is open; ``label`` is not consulted.
-    return not entity_is_open(s)
-
-
-cdef bint is_oracle(ActionType act, int tag, ActionType g_act, int g_tag,
-                    ActionType next_act, bint is_sunk):
-    # Decide whether taking action ``act`` with label ``tag`` is acceptable
-    # given the gold action/label for this token (g_act, g_tag), the gold
-    # action of the next token (next_act) and whether the open entity is
-    # already wrong (is_sunk). A MISSING gold action accepts everything.
-    # No branch handles act == MISSING, so that case falls off the end.
-    # NOTE(review): the "I, Gold U" case tests only next_act == OUT, while
-    # the neighbouring cases also accept MISSING -- confirm this is intended.
-    if g_act == MISSING:
-        return True
-    if act == BEGIN:
-        if g_act == BEGIN:
-            # B, Gold B --> Label match
-            return tag == g_tag
-        else:
-            # B, Gold I --> False (P)
-            # B, Gold L --> False (P)
-            # B, Gold O --> False (P)
-            # B, Gold U --> False (P)
-            return False
-    elif act == IN:
-        if g_act == BEGIN:
-            # I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
-            return True
-        elif g_act == IN:
-            # I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
-            return True
-        elif g_act == LAST:
-            # I, Gold L --> True iff this entity sunk and next tag == O
-            return is_sunk and (next_act == OUT or next_act == MISSING)
-        elif g_act == OUT:
-            # I, Gold O --> True iff next tag == O
-            return next_act == OUT or next_act == MISSING
-        elif g_act == UNIT:
-            # I, Gold U --> True iff next tag == O
-            return next_act == OUT
-    elif act == LAST:
-        if g_act == BEGIN:
-            # L, Gold B --> True
-            return True
-        elif g_act == IN:
-            # L, Gold I --> True iff this entity sunk
-            return is_sunk
-        elif g_act == LAST:
-            # L, Gold L --> True
-            return True
-        elif g_act == OUT:
-            # L, Gold O --> True
-            return True
-        elif g_act == UNIT:
-            # L, Gold U --> True
-            return True
-    elif act == OUT:
-        if g_act == BEGIN:
-            # O, Gold B --> False
-            return False
-        elif g_act == IN:
-            # O, Gold I --> True
-            return True
-        elif g_act == LAST:
-            # O, Gold L --> True
-            return True
-        elif g_act == OUT:
-            # O, Gold O --> True
-            return True
-        elif g_act == UNIT:
-            # O, Gold U --> False
-            return False
-    elif act == UNIT:
-        if g_act == UNIT:
-            # U, Gold U --> True iff tag match
-            return tag == g_tag
-        else:
-            # U, Gold B --> False
-            # U, Gold I --> False
-            # U, Gold L --> False
-            # U, Gold O --> False
-            return False
-    
-
-cdef int set_accept_if_valid(Move* moves, int n_classes, State* s) except 0:
-    # Set ``accept`` on every move according to whether its action is valid
-    # in state ``s`` and return how many moves were accepted. moves[0] is
-    # always rejected and the loop starts at index 1. A move whose action
-    # matches none of the branches keeps its previous ``accept`` value.
-    cdef int n_accept = 0
-    cdef Move* m
-    moves[0].accept = False
-    for i in range(1, n_classes):
-        m = &moves[i]
-        if m.action == BEGIN:
-            m.accept = can_begin(s, m.label)
-        elif m.action == IN:
-            m.accept = can_in(s, m.label)
-        elif m.action == LAST:
-            m.accept = can_last(s, m.label)
-        elif m.action == UNIT:
-            m.accept = can_unit(s, m.label)
-        elif m.action == OUT:
-            m.accept = can_out(s, m.label)
-        n_accept += m.accept
-    # 0 is the declared error value, so an empty accept set must not be
-    # returned.
-    assert n_accept != 0
-    return n_accept
-
-
-cdef int set_accept_if_oracle(Move* moves, Move* golds, int n_classes, State* s) except 0:
-    # Restrict the valid moves to those the oracle accepts for the gold move
-    # at the current position, and return how many remain accepted.
-    cdef Move* g = &golds[s.i]
-    # Peek at the next gold move only when it exists: at the last token
-    # (s.i == s.length - 1) golds[s.i + 1] would be past the end, so OUT is
-    # used instead. Assumes golds holds s.length entries -- TODO confirm.
-    cdef ActionType next_act = <ActionType>golds[s.i+1].action if s.i + 1 < s.length else OUT
-    cdef bint is_sunk = entity_is_sunk(s, golds)
-    cdef Move* m
-    cdef int n_accept = 0
-    set_accept_if_valid(moves, n_classes, s)
-    for i in range(1, n_classes):
-        m = &moves[i]
-        if not m.accept:
-            continue
-        m.accept = is_oracle(<ActionType>m.action, m.label, <ActionType>g.action,
-                             g.label, next_act, is_sunk)
-        n_accept += m.accept
-    # 0 is the declared error value, so an empty accept set must not be
-    # returned.
-    assert n_accept != 0
-    return n_accept
-
-
-cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
-    # Return a pointer to the accepted move with the highest score.
-    # moves[0] is never considered, and the score of moves[i] is read from
-    # scores[i-1]. On equal scores the lowest index wins (strict ``>``).
-    # Raises StandardError when no move in 1..n-1 is accepted.
-    cdef int first_accept = -1
-    for first_accept in range(1, n):
-        if moves[first_accept].accept:
-            break
-    else:
-        raise StandardError
-    assert first_accept != -1
-    cdef int best = first_accept
-    cdef weight_t score = scores[first_accept-1]
-    cdef int i
-    for i in range(first_accept+1, n): 
-        if moves[i].accept and scores[i-1] > score:
-            best = i
-            score = scores[i-1]
-    return &moves[best]
-
-
-cdef int transition(State *s, Move* move) except -1:
-    # Apply ``move`` to the state: BEGIN opens an entity, LAST closes the
-    # current one, UNIT opens and closes a one-token entity, IN and OUT
-    # change no entity. Then record the move's class for the current token
-    # and advance s.i.
-    if move.action == BEGIN:
-        begin_entity(s, move.label)
-    elif move.action == IN:
-        pass
-    elif move.action == LAST:
-        end_entity(s)
-    elif move.action == UNIT:
-        begin_entity(s, move.label)
-        end_entity(s)
-    elif move.action == OUT:
-        pass
-    s.tags[s.i] = move.clas 
-    s.i += 1
-
-
-def get_n_moves(n_tags):
-    """Return the number of moves for ``n_tags`` tags: four per tag plus one."""
-    return 4 * n_tags + 1
-
-
-cdef int fill_moves(Move* moves, list tag_names) except -1:
-    # Fill moves[i] from tag_names[i]. A name such as 'B-PER' gives action
-    # 'B' and label 'PER'; 'O' gives action 'O'; 'NULL' and 'EOL' give the
-    # MISSING action '?'. The last two cases use the placeholder label '-'
-    # (id 0). Labels get consecutive integer ids in order of first
-    # appearance. Raises StandardError for any other tag name.
-    cdef Move* m
-    label_names = {'-': 0}
-    for i, tag_name in enumerate(tag_names):
-        m = &moves[i]
-        if '-' in tag_name:
-            # Split on the first hyphen only, so labels may contain '-'.
-            action_str, label = tag_name.split('-', 1)
-        elif tag_name == 'O':
-            action_str = 'O'
-            label = '-'
-        elif tag_name == 'NULL' or tag_name == 'EOL':
-            action_str = '?'
-            label = '-'
-        else:
-            raise StandardError(tag_name)
-        m.action = ACTION_NAMES.index(action_str)
-        m.label = label_names.setdefault(label, len(label_names))
-        m.clas = i
--- a/spacy/ner/context.pxd
+++ b/spacy/ner/context.pxd
@ -1,155 +0,0 @@
-from thinc.typedefs cimport atom_t
-from ..typedefs cimport hash_t
-from ..tokens cimport Tokens
-from ..lexeme cimport Lexeme
-from .structs cimport State
-
-
-cpdef enum:
-    T_sic
-    T_cluster
-    T_norm
-    T_shape
-    T_asciied
-    T_prefix
-    T_suffix
-    T_length
-    T_postype
-    T_nertype
-    T_sensetype
-    T_is_alpha
-    T_is_ascii
-    T_is_digit
-    T_is_lower
-    T_is_punct
-    T_is_space
-    T_is_title
-    T_is_upper
-    T_like_url
-    T_like_number
-    T_oft_lower
-    T_oft_title
-    T_oft_upper
-    T_in_males
-    T_in_females
-    T_in_surnames
-    T_in_places
-    T_in_celebs
-    T_in_names
-    T_pos
-    T_sense
-    T_ner
-
-
-cpdef enum:
-    P2_sic
-    P2_cluster
-    P2_norm
-    P2_shape
-    P2_prefix
-    P2_suffix
-    P2_length
-    P2_postype
-    P2_is_alpha
-    P2_is_digit
-    P2_is_lower
-    P2_is_punct
-    P2_is_title
-    P2_is_upper
-    P2_like_number
-    P2_pos
-
-    P1_sic
-    P1_cluster
-    P1_norm
-    P1_shape
-    P1_prefix
-    P1_suffix
-    P1_length
-    P1_postype
-    P1_is_alpha
-    P1_is_digit
-    P1_is_lower
-    P1_is_punct
-    P1_is_title
-    P1_is_upper
-    P1_like_number
-    P1_pos
-
-    W_sic
-    W_cluster
-    W_norm
-    W_shape
-    W_prefix
-    W_suffix
-    W_length
-    W_postype
-    W_is_alpha
-    W_is_digit
-    W_is_lower
-    W_is_punct
-    W_is_space
-    W_is_title
-    W_is_upper
-    W_like_number
-    W_pos
-
-    N1_sic
-    N1_cluster
-    N1_norm
-    N1_shape
-    N1_prefix
-    N1_suffix
-    N1_length
-    N1_postype
-    N1_is_alpha
-    N1_is_ascii
-    N1_is_digit
-    N1_is_lower
-    N1_is_punct
-    N1_is_space
-    N1_is_title
-    N1_is_upper
-    N1_like_number
-    N1_pos
-
-    N2_sic
-    N2_cluster
-    N2_norm
-    N2_shape
-    N2_asciied
-    N2_prefix
-    N2_suffix
-    N2_length
-    N2_postype
-    N2_is_alpha
-    N2_is_digit
-    N2_is_lower
-    N2_is_punct
-    N2_is_space
-    N2_is_title
-    N2_is_upper
-    N2_like_number
-    N2_pos
-    N2_sense
-
-    E_label
-
-    E0_sic
-    E0_cluster
-    E0_pos
-
-    E1_sic
-    E1_cluster
-    E1_pos
-
-    E_last_sic
-    E_last_cluster
-    E_last_pos
-
-    N_FIELDS
-
-
-cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1
-
-
--- a/spacy/ner/context.pyx
+++ b/spacy/ner/context.pyx
@ -1,77 +0,0 @@
-from libc.string cimport memset
-
-from murmurhash.mrmr cimport hash64
-from ._state cimport entity_is_open
-from ..lexeme cimport *
-
-
-cdef int _fill_token(atom_t* c, Lexeme* lex, atom_t pos):
-    c[T_sic] = lex.sic
-    c[T_cluster] = lex.cluster
-    c[T_norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
-    c[T_shape] = lex.shape
-    c[T_asciied] = lex.asciied
-    c[T_prefix] = lex.prefix
-    c[T_suffix] = lex.suffix
-    c[T_length] = lex.length
-
-    c[T_postype] = lex.postype
-    c[T_nertype] = 0
-    c[T_sensetype] = 0
-    
-    c[T_is_alpha] = lex.flags & (1 << IS_ALPHA)
-    c[T_is_digit] = lex.flags & (1 << IS_DIGIT)
-    c[T_is_lower] = lex.flags & (1 << IS_LOWER)
-    c[T_is_punct] = lex.flags & (1 << IS_PUNCT)
-    c[T_is_space] = lex.flags & (1 << IS_SPACE)
-    c[T_is_title] = lex.flags & (1 << IS_TITLE)
-    c[T_is_upper] = lex.flags & (1 << IS_UPPER)
-    c[T_like_url] = lex.flags & (1 << LIKE_URL)
-    c[T_like_number] = lex.flags & (1 << LIKE_NUMBER)
-    c[T_oft_lower] = lex.flags & (1 << OFT_LOWER)
-    c[T_oft_title] = lex.flags & (1 << OFT_TITLE)
-    c[T_oft_upper] = lex.flags & (1 << OFT_UPPER)
-
-    c[T_in_males] = lex.flags & (1 << IN_MALES)
-    c[T_in_females] = lex.flags & (1 << IN_FEMALES)
-    c[T_in_surnames] = lex.flags & (1 << IN_SURNAMES)
-    c[T_in_places] = lex.flags & (1 << IN_PLACES)
-    c[T_in_celebs] = lex.flags & (1 << IN_CELEBS)
-    c[T_in_names] = lex.flags & (1 << IN_NAMES)
-
-    c[T_pos] = pos
-    c[T_sense] = 0
-
-
-cdef int _fill_outer_token(atom_t* c, Lexeme* lex, atom_t pos):
-    c[0] = lex.sic
-    c[1] = lex.cluster
-    c[2] = lex.shape
-    c[3] = pos
-
-
-cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1:
-    cdef int i
-    for i in range(N_FIELDS):
-        context[i] = 0
-    i = s.i
-    _fill_token(&context[P2_sic], tokens.lex[i-2], tokens.pos[i-2])
-    _fill_token(&context[P1_sic], tokens.lex[i-1], tokens.pos[i-1])
-    _fill_token(&context[W_sic], tokens.lex[i], tokens.pos[i])
-    _fill_token(&context[N1_sic], tokens.lex[i+1], tokens.pos[i+1])
-    _fill_token(&context[N2_sic], tokens.lex[i+2], tokens.pos[i+2])
-
-    cdef atom_t[5] ent_vals
-    if entity_is_open(s):
-        context[E_label] = s.curr.label
-        context[E0_sic] = tokens.lex[s.curr.start].sic
-        context[E0_cluster] = tokens.lex[s.curr.start].cluster
-        context[E0_pos] = tokens.pos[s.curr.start]
-        context[E_last_sic] = tokens.lex[s.i-1].sic
-        context[E_last_cluster] = tokens.lex[s.i-1].cluster
-        context[E_last_pos] = tokens.pos[s.i-1]
-        if (s.curr.start + 1) < s.i:
-            context[E1_sic] = tokens.lex[s.curr.start+1].sic
-            context[E1_cluster] = tokens.lex[s.curr.start+1].cluster
-            context[E1_pos] = tokens.pos[s.curr.start+1]
-    return 1
--- a/spacy/ner/feats.pxd
+++ b/spacy/ner/feats.pxd
--- a/spacy/ner/feats.pyx
+++ b/spacy/ner/feats.pyx
@ -1,107 +0,0 @@
-from .context import *
-
-
-LOCAL = (
-    (W_sic,),
-    (P1_sic,),
-    (N1_sic,),
-    (P2_sic,),
-    (N2_sic,),
-    
-    (P1_sic, W_sic,),
-    (W_sic, N1_sic),
-    
-    (W_prefix,),
-    (W_suffix,),
-
-    (P1_shape,),
-    (W_shape,),
-    (N1_shape,),
-    (P1_shape, W_shape,),
-    (W_shape, P1_shape,),
-    (P1_shape, W_shape, N1_shape),
-    (N2_shape,),
-    (P2_shape,),
-
-    (P2_norm, P1_norm, W_norm),
-    (P1_norm, W_norm, N1_norm),
-    (W_norm, N1_norm, N2_norm)
-)
-
-POS = (
-    (P2_pos,),
-    (P1_pos,),
-    (W_pos,),
-    (N1_pos,),
-    (N2_pos,),
-
-    (P1_pos, W_pos),
-    (W_pos, N1_pos),
-    (P2_pos, P1_pos, W_pos),
-    (P1_pos, W_pos, N1_pos),
-    (W_pos, N1_pos, N2_pos)
-)
-
-CLUSTERS = (
-    (P2_cluster,),
-    (P1_cluster,),
-    (W_cluster,),
-    (N1_cluster,),
-    (N2_cluster,),
-
-    (P1_cluster, W_cluster),
-    (W_cluster, N1_cluster),
-)
-
-
-CLUSTER_POS = (
-    (P1_cluster, W_pos),
-    (W_pos, P1_cluster),
-    (W_cluster, N1_pos),
-    (W_pos, N1_cluster)
-)
-
-
-STATE = (
-   (E0_sic,),
-   (E0_cluster,),
-   (E0_pos,),
-   (E_last_sic,),
-   (E_last_cluster,),
-   (E_last_pos,),
-
-   (E0_sic, W_sic),
-   (E0_cluster, W_cluster),
-   (E0_pos, W_pos),
-   (E_last_sic, W_sic),
-   (E_last_pos, W_pos),
-
-   (E0_pos, E_last_pos, W_pos),
-   (E0_cluster, E_last_cluster, W_cluster),
-
-   (E0_sic, E_last_sic),
-   (E0_pos, E_last_pos),
-   (E0_cluster, E_last_cluster),
-   (E0_pos, E_last_cluster),
-   (E0_cluster, E_last_pos),
-
-   (E1_sic,),
-   (E1_cluster,),
-   (E1_pos,),
-
-   (E0_sic, E1_sic),
-   (E0_sic, E1_pos,),
-   (E0_pos, E1_sic,),
-   (E0_pos, E1_pos),
-
-   (E_label,),
-   (E_label, W_sic),
-   (E_label, W_pos),
-   (E_label, W_cluster),
-   (E_label, W_shape),
-   (E_label, E_last_sic),
-   (E_label, E0_pos, E_last_pos),
-)
-
-
-TEMPLATES = LOCAL + CLUSTERS + POS + CLUSTER_POS + STATE
--- a/spacy/ner/greedy_parser.pxd
+++ b/spacy/ner/greedy_parser.pxd
@ -1,29 +0,0 @@
-from cymem.cymem cimport Pool
-from thinc.features cimport Extractor
-from thinc.learner cimport LinearModel
-from thinc.typedefs cimport *
-
-from ..tokens cimport Tokens
-from ..typedefs cimport *
-
-from .structs cimport Move
-from .annot cimport NERAnnotation
-
-
-cdef class NERParser:
-    cdef Pool mem
-    cdef Extractor extractor
-    cdef LinearModel model
-    cdef readonly list tag_names
-    cdef readonly list entity_types
-    cdef readonly int n_classes
-
-    cdef Move* _moves
-    cdef atom_t* _context
-    cdef feat_t* _feats
-    cdef weight_t* _values
-    cdef weight_t* _scores
-
-
-    cpdef list train(self, Tokens tokens, NERAnnotation annot)
-    cpdef list set_tags(self, Tokens tokens)
--- a/spacy/ner/greedy_parser.pyx
+++ b/spacy/ner/greedy_parser.pyx
@ -1,81 +0,0 @@
-cimport cython
-import random
-import os
-from os import path
-import shutil
-import json
-
-from thinc.features cimport ConjFeat
-
-from ..context cimport fill_context
-from ..context cimport N_FIELDS
-from .moves cimport Move
-from .moves cimport fill_moves, transition, best_accepted
-from .moves cimport set_accept_if_valid, set_accept_if_oracle
-from .moves import get_n_moves
-from ._state cimport State
-from ._state cimport init_state
-
-
-cdef class NERParser:
-    def __init__(self, model_dir):
-        self.mem = Pool()
-        cfg = json.load(open(path.join(model_dir, 'config.json')))
-        templates = cfg['templates']
-        self.entity_types = cfg['entity_types']
-        self.extractor = Extractor(templates, [ConjFeat] * len(templates))
-        self.n_classes = get_n_moves(len(self.entity_types))
-        self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
-        fill_moves(self._moves, len(self.entity_types))
-        self.model = LinearModel(len(self.tag_names))
-        if path.exists(path.join(model_dir, 'model')):
-            self.model.load(path.join(model_dir, 'model'))
-
-        self._context = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))
-        self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
-        self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
-        self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
-
-    cpdef int train(self, Tokens tokens, gold_classes):
-        cdef Pool mem = Pool()
-        cdef State* s = init_state(mem, tokens.length)
-        cdef Move* golds = <Move*>mem.alloc(len(gold_classes), sizeof(Move))
-        for i, clas in enumerate(gold_classes):
-            golds[i] = self.moves[clas - 1]
-            assert golds[i].id == clas
-        cdef Move* guess
-        while s.i < tokens.length:
-            fill_context(self._context, s.i, tokens)
-            self.extractor.extract(self._feats, self._values, self._context, NULL)
-            self.model.score(self._scores, self._feats, self._values)
-            
-            set_accept_if_valid(self._moves, self.n_classes, s)
-            guess = best_accepted(self._moves, self._scores, self.n_classes)
-
-            set_accept_if_oracle(self._moves, golds, self.n_classes, s) # TODO
-            gold = best_accepted(self._moves, self._scores, self.n_classes)
-
-            if guess.clas == gold.clas:
-                self.model.update({})
-                return 0
-
-            counts = {guess.clas: {}, gold.clas: {}}
-            self.extractor.count(counts[gold.clas], self._feats, 1)
-            self.extractor.count(counts[guess.clas], self._feats, -1)
-            self.model.update(counts)
-
-            transition(s, guess)
-            tokens.ner[s.i-1] = s.tags[s.i-1]
-
-    cpdef int set_tags(self, Tokens tokens) except -1:
-        cdef Pool mem = Pool()
-        cdef State* s = init_state(mem, tokens.length)
-        cdef Move* move
-        while s.i < tokens.length:
-            fill_context(self._context, s.i, tokens)
-            self.extractor.extract(self._feats, self._values, self._context, NULL)
-            self.model.score(self._scores, self._feats, self._values)
-            set_accept_if_valid(self._moves, self.n_classes, s)
-            move = best_accepted(self._moves, self._scores, self.n_classes)
-            transition(s, move)
-            tokens.ner[s.i-1] = s.tags[s.i-1]
--- a/spacy/ner/io_moves.pxd
+++ b/spacy/ner/io_moves.pxd
@ -1,26 +0,0 @@
-from cymem.cymem cimport Pool
-
-from thinc.typedefs cimport class_t
-from thinc.typedefs cimport weight_t
-
-from .structs cimport State, Move
-
-
-cpdef enum ActionType:
-    MISSING
-    SHIFT
-    REDUCE
-    OUT
-    N_ACTIONS
-
-
-cdef int set_accept_if_oracle(Move* moves, int n, State* s,
-                              int* g_starts, int* g_ends, int* g_labels) except 0
- 
-cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0
-
-cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL
-
-cdef int transition(State *s, Move* m) except -1
-
-cdef int fill_moves(Move* moves, int n, list entity_types) except -1
--- a/spacy/ner/io_moves.pyx
+++ b/spacy/ner/io_moves.pyx
@ -1,161 +0,0 @@
-from __future__ import unicode_literals
-from cymem.cymem cimport Pool
-
-from thinc.typedefs cimport class_t
-from thinc.typedefs cimport weight_t
-
-from ._state cimport begin_entity
-from ._state cimport end_entity
-from ._state cimport entity_is_open
-
-
-ACTION_NAMES = ['' for _ in range(N_ACTIONS)]
-ACTION_NAMES[<int>MISSING] = '?'
-ACTION_NAMES[<int>SHIFT] = 'S'
-ACTION_NAMES[<int>REDUCE] = 'R'
-ACTION_NAMES[<int>OUT] = 'O'
-
-
-cdef int set_accept_if_oracle(Move* moves, int n, State* s,
-                              int* g_starts, int* g_ends, int* g_labels) except 0:
-    # If curr entity: (O invalid)
-    #   if cost is not sunk (start matches, end is i-1 or greater
-    #     - If i-1 == gold.end --> R=True, S=False
-    #     - Shift if end >= i --> S=True, R=False
-    #   else
-    #     - If i == gold.start --> R=True, S=False
-    #     - Else --> R=True, S=True
-    # Else (R invalid):
-    #   if start == gold.start: S=True, O=False
-    #   else: O=True, S=False
-    if entity_is_open(s):
-        g_start = g_starts[s.curr.start]
-        g_end = g_ends[s.curr.start]
-        accept_o = False
-        if g_start == s.curr.start and g_end == s.i:
-            accept_r = True
-            accept_s = False
-        elif g_start == s.curr.start and g_end > s.i:
-            accept_s = True
-            s_label = s.curr.label
-            accept_r = False
-        elif g_starts[s.i] == s.i:
-            accept_r = True
-            accept_s = False
-        else:
-            accept_r = True
-            accept_s = True
-            s_label = s.curr.label
-    else:
-        accept_r = False
-        if g_starts[s.i] == s.i:
-            accept_s = True
-            s_label = g_labels[s.i]
-            accept_o = False
-        else:
-            accept_o = True
-            accept_s = False
-    n_accept = 0
-    moves[0].accept = False
-    for i in range(1, n):
-        m = &moves[i]
-        if m.action == SHIFT:
-            m.accept = accept_s and m.label == s_label
-        elif m.action == REDUCE:
-            m.accept = accept_r
-        elif m.action == OUT:
-            m.accept = accept_o
-        n_accept += m.accept
-    assert n_accept != 0
-    return n_accept
-
-
-cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0:
-    cdef int i
-    cdef bint open_ent = entity_is_open(s)
-    cdef int n_accept = 0
-    moves[0].accept = False
-    for i in range(1, n):
-        if moves[i].action == SHIFT:
-            if s.i >= s.length:
-                moves[i].accept = False
-            elif open_ent and moves[i].label != s.curr.label:
-                moves[i].accept = False
-            else:
-                moves[i].accept = True
-        elif moves[i].action == REDUCE:
-            moves[i].accept = open_ent
-        elif moves[i].action == OUT:
-            moves[i].accept = s.i < s.length and not open_ent
-        n_accept += moves[i].accept
-    return n_accept
-
-
-cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
-    cdef int first_accept = -1
-    for first_accept in range(1, n):
-        if moves[first_accept].accept:
-            break
-    else:
-        raise StandardError
-    assert first_accept != -1
-    cdef int best = first_accept
-    cdef weight_t score = scores[first_accept-1]
-    cdef int i
-    for i in range(first_accept+1, n): 
-        if moves[i].accept and scores[i-1] > score:
-            best = i
-            score = scores[i-1]
-    return &moves[best]
-
-
-cdef int transition(State *s, Move* move) except -1:
-    s.tags[s.i] = move.clas 
-    if move.action == OUT:
-        s.i += 1
-    elif move.action == SHIFT:
-        if not entity_is_open(s):
-            s.curr.start = s.i
-            s.curr.label = move.label
-        s.i += 1
-    elif move.action == REDUCE:
-        s.curr.end = s.i
-        s.ents[s.j] = s.curr
-        s.j += 1
-        s.curr.start = 0
-        s.curr.label = -1
-        s.curr.end = 0
-    else:
-        raise ValueError(move.action)
-
-
-def get_n_moves(n_tags):
-    return 1 + 1 + 1 + n_tags
-
-
-cdef int fill_moves(Move* moves, int n, list entity_types) except -1:
-    cdef Move* m
-    label_names = {'-': 0}
-    # Reserve class 0
-    cdef int i = 0
-    moves[i].clas = i
-    moves[i].action = MISSING
-    moves[i].label = 0
-    i += 1
-    for entity_type in entity_types:
-        moves[i].action = SHIFT
-        moves[i].label = label_names.setdefault(entity_type, len(label_names))
-        moves[i].clas = i
-        i += 1
-    moves[i].clas = i
-    moves[i].action = OUT
-    moves[i].label = 0
-    i += 1
-    moves[i].action = REDUCE
-    moves[i].clas = i
-    moves[i].label = 0
-    i += 1
-
-
-cdef bint is_final(State* s):
-    return s.i == s.length and not entity_is_open(s)
--- a/spacy/ner/pystate.pxd
+++ b/spacy/ner/pystate.pxd
@ -1,16 +0,0 @@
-from cymem.cymem cimport Pool
-
-from .structs cimport Move, State
-
-
-cdef class PyState:
-    cdef Pool mem
-    cdef readonly list tag_names
-    cdef readonly int n_classes
-    cdef readonly dict moves_by_name
-    
-    cdef Move* _moves
-    cdef Move* _golds
-    cdef State* _s
-
-    cdef Move* _get_move(self, unicode move_name) except NULL
--- a/spacy/ner/pystate.pyx
+++ b/spacy/ner/pystate.pyx
@ -1,60 +0,0 @@
-from __future__ import unicode_literals
-
-from ._state cimport init_state
-from ._state cimport entity_is_open
-from .bilou_moves cimport fill_moves
-from .bilou_moves cimport transition
-from .bilou_moves cimport set_accept_if_valid, set_accept_if_oracle
-from .bilou_moves import get_n_moves
-from .bilou_moves import ACTION_NAMES
-
-
-cdef class PyState:
-    def __init__(self, tag_names, n_tokens):
-        self.mem = Pool()
-        self.tag_names = tag_names
-        self.n_classes = len(tag_names)
-        assert self.n_classes != 0
-        self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
-        fill_moves(self._moves, tag_names)
-        self._s = init_state(self.mem, n_tokens)
-        self._golds = <Move*>self.mem.alloc(n_tokens, sizeof(Move))
-
-    cdef Move* _get_move(self, unicode move_name) except NULL:
-        return &self._moves[self.tag_names.index(move_name)]
-
-    def set_golds(self, list gold_names):
-        cdef Move* m
-        for i, name in enumerate(gold_names):
-            m = self._get_move(name)
-            self._golds[i] = m[0]
-
-    def transition(self, unicode move_name):
-        cdef Move* m = self._get_move(move_name)
-        transition(self._s, m)
-
-    def is_valid(self, unicode move_name):
-        cdef Move* m = self._get_move(move_name)
-        set_accept_if_valid(self._moves, self.n_classes, self._s)
-        return m.accept
-
-    def is_gold(self, unicode move_name):
-        cdef Move* m = self._get_move(move_name)
-        set_accept_if_oracle(self._moves, self._golds, self.n_classes, self._s)
-        return m.accept
-
-    property ent:
-        def __get__(self):
-            return self._s.curr
-
-    property n_ents:
-        def __get__(self):
-            return self._s.j
-
-    property i:
-        def __get__(self):
-            return self._s.i
-
-    property open_entity:
-        def __get__(self):
-            return entity_is_open(self._s)
--- a/spacy/ner/structs.pxd
+++ b/spacy/ner/structs.pxd
@ -1,23 +0,0 @@
-from thinc.typedefs cimport class_t
-
-
-cdef struct Entity:
-    int start
-    int end
-    int label
-
-
-cdef struct State:
-    Entity curr
-    Entity* ents
-    int* tags
-    int i
-    int j
-    int length
-
-
-cdef struct Move:
-    class_t clas
-    int action
-    int label
-    bint accept
--- a/spacy/pos_feats.pxd
+++ b/spacy/pos_feats.pxd
--- a/spacy/pos_feats.pyx
+++ b/spacy/pos_feats.pyx
@ -1,41 +0,0 @@
-from spacy.context cimport FIELD_IDS, Token
-
-
-cpdef Token P2 = FIELD_IDS.P2
-cpdef Token P1 = FIELD_IDS.P1
-cpdef Token N0 = FIELD_IDS.N0
-cpdef Token N1 = FIELD_IDS.N1
-cpdef Token N2 = FIELD_IDS.N2
-
-
-TEMPLATES = (
-    (N0.sic,),
-    (N0.norm,),
-    (N0.suffix,),
-    (N0.prefix,),
-    (P1.pos,),
-    (P2.pos,),
-    (P1.pos, P2.pos),
-    (P1.pos, N0.norm),
-    (P1.norm,),
-    (P1.suffix,),
-    (P2.norm,),
-    (N1.norm,),
-    (N1.suffix,),
-    (N2.norm,),
-
-    (N0.shape,),
-    (N0.cluster,),
-    (N1.cluster,),
-    (N2.cluster,),
-    (P1.cluster,),
-    (P2.cluster,),
-    (N0.oft_upper,),
-    (N0.oft_title,),
-
-    (N0.postype,),
-
-    (P1.like_url,),
-    (N1.like_number,),
-    (N1.like_url,),
-)
--- a/spacy/pos_util.py
+++ b/spacy/pos_util.py
@ -1,153 +0,0 @@
-from __future__ import unicode_literals
-from . import util
-from . import tokens
-from .en import EN
-
-
-def read_gold(file_, tag_list, col):
-    paras = file_.read().strip().split('\n\n')
-    golds = []
-    tag_ids = dict((tag, i) for i, tag in enumerate(tag_list))
-    for para in paras:
-        if not para.strip():
-            continue
-        lines = para.strip().split('\n')
-        raw = lines.pop(0)
-        gold_toks = lines.pop(0)
-        tokens = EN.tokenize(raw)
-        tags = []
-        conll_toks = []
-        for line in lines:
-            pieces = line.split()
-            conll_toks.append((int(pieces[0]), len(pieces[1]), pieces[col]))
-        for i, token in enumerate(tokens):
-            if not conll_toks:
-                tags.append('NULL')
-            elif token.idx == conll_toks[0][0]:
-                tags.append(conll_toks[0][2])
-                conll_toks.pop(0)
-            elif token.idx < conll_toks[0]:
-                tags.append('NULL')
-            else:
-                conll_toks.pop(0)
-        assert len(tags) == len(tokens)
-        tags = [_encode_pos(t, tag_ids, tag_list) for t in tags]
-        golds.append((tokens, tags))
-    return golds
-
-def _encode_pos(tag, tag_ids, tag_list):
-    if tag == '-':
-        return 0
-    if tag not in tag_ids:
-        tag_ids[tag] = len(tag_list)
-        tag_list.append(tag)
-    return tag_ids[tag]
-
-
-def ptb_to_univ(tag):
-    mapping = dict(tuple(line.split()) for line in """
-NULL    NULL
-HYPH   .
-ADD X
-NFP .
-AFX X
-XX  X
-BES VERB
-HVS VERB
-GW  X
-!	.
-#	.
-$	.
-''	.
-(	.
-)	.
-,	.
-LRB-	.
-RRB-	.
-.	.
-:	.
-?	.
-CC	CONJ
-CD	NUM
-CD|RB	X
-DT	DET
-EX	DET
-FW	X
-IN	ADP
-IN|RP	ADP
-JJ	ADJ
-JJR	ADJ
-JJRJR	ADJ
-JJS	ADJ
-JJ|RB	ADJ
-JJ|VBG	ADJ
-LS	X
-MD	VERB
-NN	NOUN
-NNP	NOUN
-NNPS	NOUN
-NNS	NOUN
-NN|NNS	NOUN
-NN|SYM	NOUN
-NN|VBG	NOUN
-NP	NOUN
-PDT	DET
-POS	PRT
-PRP	PRON
-PRP$	PRON
-PRP|VBP	PRON
-PRT	PRT
-RB	ADV
-RBR	ADV
-RBS	ADV
-RB|RP	ADV
-RB|VBG	ADV
-RN	X
-RP	PRT
-SYM	X
-TO	PRT
-UH	X
-VB	VERB
-VBD	VERB
-VBD|VBN	VERB
-VBG	VERB
-VBG|NN	VERB
-VBN	VERB
-VBP	VERB
-VBP|TO	VERB
-VBZ	VERB
-VP	VERB
-WDT	DET
-WH	X
-WP	PRON
-WP$	PRON
-WRB	ADV
-!	PRT
-#	X
-$	NUM
-&	CONJ
-,	.
-@	X
-A	ADJ
-D	DET
-E	X
-G	X
-L	PRT
-M	PRT
-N	NOUN
-O	PRON
-P	ADP
-R	ADV
-S	NOUN
-T	PRT
-U	X
-V	VERB
-X	PRT
-Y	PRT
-Z	NOUN
-^	NOUN
-~	X
-``	.
-EOL EOL""".strip().split('\n'))
-    return mapping[tag]
-
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@ -8,7 +8,7 @@ from .structs cimport Utf8Str, UniStr
 cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
    s.chars = &chars[start]
    s.n = end - start
-    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
+    s.key = hash64(s.chars, <int>(s.n * sizeof(Py_UNICODE)), 0)


 cdef class StringStore:
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@ -1,6 +1,6 @@
 from libc.stdint cimport uint8_t, uint32_t

-from .typedefs cimport flags_t, attr_t, id_t, hash_t
+from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t


 cdef struct Lexeme:
@ -34,7 +34,7 @@ cdef struct Morphology:
 cdef struct PosTag:
    Morphology morph
    int id
-    int pos
+    univ_tag_t pos


 cdef struct TokenC:
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@ -2,7 +2,7 @@ from libc.stdint cimport uint32_t

 from cymem.cymem cimport Pool

-from ..tokens cimport TokenC
+from ..structs cimport TokenC


 cdef struct State:
@ -20,7 +20,8 @@ cdef int pop_stack(State *s) except -1
 cdef int push_stack(State *s) except -1


-cdef bint has_head(const TokenC* t) nogil
+cdef inline bint has_head(const TokenC* t) nogil:
+    return t.head != 0


 cdef inline int get_idx(const State* s, const TokenC* t) nogil:
@ -70,29 +71,14 @@ cdef inline bint is_final(const State *s) nogil:
    return at_eol(s) # The stack will be attached to root anyway


-cdef int children_in_buffer(const State *s, const int head, const int* gold) except -1
-cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1
-cdef int children_in_stack(const State *s, const int head, const int* gold) except -1
-cdef int head_in_stack(const State *s, const int child, const int* gold) except -1
+cdef int children_in_buffer(const State *s, const int head, int* gold) except -1
+cdef int head_in_buffer(const State *s, const int child, int* gold) except -1
+cdef int children_in_stack(const State *s, const int head, int* gold) except -1
+cdef int head_in_stack(const State *s, const int child, int* gold) except -1

 cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL


-cdef int count_left_kids(const TokenC* head) nogil
-
-
-cdef int count_right_kids(const TokenC* head) nogil
-
-
-# From https://en.wikipedia.org/wiki/Hamming_weight
-cdef inline uint32_t _popcount(uint32_t x) nogil:
-    """Find number of non-zero bits."""
-    cdef int count = 0
-    while x != 0:
-        x &= x - 1
-        count += 1
-    return count
-

 cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
    cdef int i
--- a/spacy/syntax/_state.pyx
+++ b/spacy/syntax/_state.pyx
@ -3,32 +3,24 @@ from libc.string cimport memmove
 from cymem.cymem cimport Pool

 from ..lexeme cimport EMPTY_LEXEME
-from ..tokens cimport TokenC
-
-
-DEF PADDING = 5
-DEF NON_MONOTONIC = True


 cdef int add_dep(State *s, int head, int child, int label) except -1:
-    cdef int dist = head - child
-    s.sent[child].head = dist
+    s.sent[child].head = head - child
    s.sent[child].dep_tag = label
    # Keep a bit-vector tracking child dependencies.  If a word has a child at
    # offset i from it, set that bit (tracking left and right separately)
    if child > head:
-        s.sent[head].r_kids |= 1 << (-dist)
+        s.sent[head].r_kids |= 1 << (-s.sent[child].head)
    else:
-        s.sent[head].l_kids |= 1 << dist
+        s.sent[head].l_kids |= 1 << s.sent[child].head


 cdef int pop_stack(State *s) except -1:
    assert s.stack_len >= 1
    s.stack_len -= 1
    s.stack -= 1
-    if s.stack_len == 0 and not at_eol(s):
-        push_stack(s)
-        
+

 cdef int push_stack(State *s) except -1:
    assert s.i < s.sent_len
@ -36,14 +28,9 @@ cdef int push_stack(State *s) except -1:
    s.stack[0] = s.i
    s.stack_len += 1
    s.i += 1
-    if at_eol(s):
-        while s.stack_len != 0:
-            if not has_head(get_s0(s)):
-                get_s0(s).dep_tag = 0
-            pop_stack(s)


-cdef int children_in_buffer(const State *s, int head, const int* gold) except -1:
+cdef int children_in_buffer(const State *s, int head, int* gold) except -1:
    # Golds holds an array of head offsets --- the head of word i is i - golds[i]
    # Iterate over the tokens of the queue, and check whether their gold head is
    # our target
@ -55,21 +42,20 @@ cdef int children_in_buffer(const State *s, int head, const int* gold) except -1
    return n


-cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1:
+cdef int head_in_buffer(const State *s, const int child, int* gold) except -1:
    return gold[child] >= s.i


-cdef int children_in_stack(const State *s, const int head, const int* gold) except -1:
+cdef int children_in_stack(const State *s, const int head, int* gold) except -1:
    cdef int i
    cdef int n = 0
    for i in range(s.stack_len):
        if gold[s.stack[-i]] == head:
-            if NON_MONOTONIC or not has_head(get_s0(s)):
-                n += 1
+            n += 1
    return n


-cdef int head_in_stack(const State *s, const int child, const int* gold) except -1:
+cdef int head_in_stack(const State *s, const int child, int* gold) except -1:
    cdef int i
    for i in range(s.stack_len):
        if gold[child] == s.stack[-i]:
@ -86,7 +72,7 @@ cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) n
    if child >= s.sent:
        return child
    else:
-        return NULL
+        return s.sent - 1


 cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil:
@ -98,20 +84,10 @@ cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx)
    if child < (s.sent + s.sent_len):
        return child
    else:
-        return NULL
+        return s.sent - 1


-cdef bint has_head(const TokenC* t) nogil:
-    return t.head != 0
-
-
-cdef int count_left_kids(const TokenC* head) nogil:
-    return _popcount(head.l_kids)
-
-
-cdef int count_right_kids(const TokenC* head) nogil:
-    return _popcount(head.r_kids)
-
+DEF PADDING = 5


 cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL:
@ -126,5 +102,4 @@ cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NUL
    s.stack_len = 0
    s.i = 0
    s.sent_len = sent_length
-    push_stack(s)
    return s
--- a/spacy/syntax/arc_eager.pxd
+++ b/spacy/syntax/arc_eager.pxd
@ -7,11 +7,8 @@ from ._state cimport State


 cdef struct Transition:
-    int clas
    int move
    int label
-    int cost
-    weight_t score


 cdef class TransitionSystem:
@ -21,8 +18,7 @@ cdef class TransitionSystem:

    cdef const Transition* _moves

-    cdef Transition best_valid(self, const weight_t* scores, const State* s) except *
-    cdef Transition best_gold(self, Transition* guess, const weight_t* scores,
-                              const State* s,
-                              const int* gold_heads, const int* gold_labels) except *
+    cdef Transition best_valid(self, const weight_t* scores, const State* s) except -1
+    cdef Transition best_gold(self, const weight_t* scores, const State* s,
+                              int* gold_heads, int* gold_labels) except -1
    cdef int transition(self, State *s, const Transition* t) except -1
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@ -7,8 +7,6 @@ from ._state cimport head_in_stack, children_in_stack

 from ..tokens cimport TokenC

-DEF NON_MONOTONIC = True
-

 cdef enum:
    SHIFT
@ -27,30 +25,22 @@ cdef inline bint _can_right(const State* s) nogil:


 cdef inline bint _can_left(const State* s) nogil:
-    if NON_MONOTONIC:
-        return s.stack_len >= 1
-    else:
-        return s.stack_len >= 1 and not has_head(get_s0(s))
+    return s.stack_len >= 1 and not has_head(get_s0(s))


 cdef inline bint _can_reduce(const State* s) nogil:
-    if NON_MONOTONIC:
-        return s.stack_len >= 2
-    else:
-        return s.stack_len >= 2 and has_head(get_s0(s))
+    return s.stack_len >= 2 and has_head(get_s0(s))


-cdef int _shift_cost(const State* s, const int* gold) except -1:
+cdef int _shift_cost(const State* s, int* gold) except -1:
    assert not at_eol(s)
    cost = 0
    cost += head_in_stack(s, s.i, gold)
    cost += children_in_stack(s, s.i, gold)
-    if NON_MONOTONIC:
-        cost += gold[s.stack[0]] == s.i
    return cost


-cdef int _right_cost(const State* s, const int* gold) except -1:
+cdef int _right_cost(const State* s, int* gold) except -1:
    assert s.stack_len >= 1
    cost = 0
    if gold[s.i] == s.stack[0]:
@ -58,12 +48,10 @@ cdef int _right_cost(const State* s, const int* gold) except -1:
    cost += head_in_buffer(s, s.i, gold)
    cost += children_in_stack(s, s.i, gold)
    cost += head_in_stack(s, s.i, gold)
-    if NON_MONOTONIC:
-        cost += gold[s.stack[0]] == s.i
    return cost


-cdef int _left_cost(const State* s, const int* gold) except -1:
+cdef int _left_cost(const State* s, int* gold) except -1:
    assert s.stack_len >= 1
    cost = 0
    if gold[s.stack[0]] == s.i:
@ -71,17 +59,11 @@ cdef int _left_cost(const State* s, const int* gold) except -1:

    cost += head_in_buffer(s, s.stack[0], gold)
    cost += children_in_buffer(s, s.stack[0], gold)
-    if NON_MONOTONIC and s.stack_len >= 2:
-        cost += gold[s.stack[0]] == s.stack[-1]
    return cost


-cdef int _reduce_cost(const State* s, const int* gold) except -1:
-    cdef int cost = 0
-    cost += children_in_buffer(s, s.stack[0], gold)
-    if NON_MONOTONIC:
-        cost += head_in_buffer(s, s.stack[0], gold)
-    return cost
+# Cost of a REDUCE move: whatever children_in_buffer reports for the word at
+# s.stack[0] under the gold heads -- presumably the number of its gold
+# children still in the buffer (TODO confirm against _state).
+cdef int _reduce_cost(const State* s, int* gold) except -1:
+    return children_in_buffer(s, s.stack[0], gold)


 cdef class TransitionSystem:
@ -91,40 +73,38 @@ cdef class TransitionSystem:
        right_labels.sort()
        if 'ROOT' in right_labels:
            right_labels.pop(right_labels.index('ROOT'))
+        if 'dep' in right_labels:
+            right_labels.pop(right_labels.index('dep'))
        if 'ROOT' in left_labels:
            left_labels.pop(left_labels.index('ROOT'))
+        if 'dep' in left_labels:
+            left_labels.pop(left_labels.index('dep'))
        self.n_moves = 2 + len(left_labels) + len(right_labels) 
        moves = <Transition*>self.mem.alloc(self.n_moves, sizeof(Transition))
        cdef int i = 0
        moves[i].move = SHIFT
        moves[i].label = 0
-        moves[i].clas = i
        i += 1
        moves[i].move = REDUCE
        moves[i].label = 0
-        moves[i].clas = i
        i += 1
-        self.label_ids = {'ROOT': 0}
+        self.label_ids = {'ROOT': 0, 'dep': -1}
        cdef int label_id
        for label_str in left_labels:
            label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
            moves[i].move = LEFT
            moves[i].label = label_id
-            moves[i].clas = i
            i += 1
        for label_str in right_labels:
            label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
            moves[i].move = RIGHT
            moves[i].label = label_id
-            moves[i].clas = i
            i += 1
        self._moves = moves

-    cdef int transition(self, State *s, const Transition* t) except -1:
+    cdef int transition(self, State *s, const int clas) except -1:
+        cdef const Transition* t = &self._moves[clas]
        if t.move == SHIFT:
-            # Set the dep label, in case we need it after we reduce
-            if NON_MONOTONIC:
-                get_s0(s).dep_tag = t.label
            push_stack(s)
        elif t.move == LEFT:
            add_dep(s, s.i, s.stack[0], t.label)
@ -133,12 +113,11 @@ cdef class TransitionSystem:
            add_dep(s, s.stack[0], s.i, t.label)
            push_stack(s)
        elif t.move == REDUCE:
-            add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep_tag)
            pop_stack(s)
        else:
            raise StandardError(t.move)

-    cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
+    cdef int best_valid(self, const weight_t* scores, const State* s) except -1:
        cdef bint[N_MOVES] valid
        valid[SHIFT] = _can_shift(s)
        valid[LEFT] = _can_left(s)
@ -147,61 +126,59 @@ cdef class TransitionSystem:

        cdef int best = -1
        cdef weight_t score = 0
-        cdef weight_t best_r_score = -9000
-        cdef int best_r_label = -1
        cdef int i
        for i in range(self.n_moves):
            if valid[self._moves[i].move] and (best == -1 or scores[i] > score):
                best = i
                score = scores[i]
-            if self._moves[i].move == RIGHT and scores[i] > best_r_score:
-                best_r_label = self._moves[i].label
        assert best >= 0
-        cdef Transition t = self._moves[best]
-        t.score = score
-        if t.move == SHIFT:
-            t.label = best_r_label
-        return t
+        return best

-    cdef Transition best_gold(self, Transition* guess, const weight_t* scores,
-                              const State* s,
-                              const int* gold_heads, const int* gold_labels) except *:
-        # If we can create a gold dependency, only one action can be correct
+    cdef int best_gold(self, const weight_t* scores, const State* s,
+                       int* gold_heads, int* gold_labels) except -1:
        cdef int[N_MOVES] unl_costs
        unl_costs[SHIFT] = _shift_cost(s, gold_heads) if _can_shift(s) else -1
        unl_costs[LEFT] = _left_cost(s, gold_heads) if _can_left(s) else -1
        unl_costs[RIGHT] = _right_cost(s, gold_heads) if _can_right(s) else -1
        unl_costs[REDUCE] = _reduce_cost(s, gold_heads) if _can_reduce(s) else -1

-        guess.cost = unl_costs[guess.move]
-        cdef Transition t
-        cdef int target_label
-        cdef int i
-        if gold_heads[s.stack[0]] == s.i:
-            target_label = gold_labels[s.stack[0]]
-            if guess.move == LEFT:
-                guess.cost += guess.label != target_label
-            for i in range(self.n_moves):
-                t = self._moves[i]
-                if t.move == LEFT and t.label == target_label:
-                    return t
-        elif gold_heads[s.i] == s.stack[0]:
-            target_label = gold_labels[s.i]
-            if guess.move == RIGHT:
-                guess.cost += guess.label != target_label
-            for i in range(self.n_moves):
-                t = self._moves[i]
-                if t.move == RIGHT and t.label == target_label:
-                    return t
-
+        cdef int cost
+        cdef int move
+        cdef int label
        cdef int best = -1
        cdef weight_t score = -9000
+        cdef int i
        for i in range(self.n_moves):
-            t = self._moves[i]
-            if unl_costs[t.move] == 0 and (best == -1 or scores[i] > score):
-                best = i
-                score = scores[i]
-        t = self._moves[best]
-        t.score = score
-        assert best >= 0
-        return t
+            move = self._moves[i].move
+            label = self._moves[i].label
+            if unl_costs[move] == 0: 
+                if move == SHIFT or move == REDUCE:
+                    cost = 0
+                elif move == LEFT:
+                    if gold_heads[s.stack[0]] == s.i and gold_labels[s.stack[0]] != -1:
+                        cost = label != gold_labels[s.stack[0]]
+                    else:
+                        cost = 0
+                elif move == RIGHT:
+                    if gold_heads[s.i] == s.stack[0] and gold_labels[s.i] != -1:
+                        cost = label != gold_labels[s.i]
+                    else:
+                        cost = 0
+                else:
+                    raise StandardError("Unknown Move")
+                if cost == 0 and (best == -1 or scores[i] > score):
+                    best = i
+                    score = scores[i]
+ 
+        if best < 0:
+            print unl_costs[SHIFT], unl_costs[REDUCE], unl_costs[LEFT], unl_costs[RIGHT]
+            print s.stack_len
+            print has_head(get_s0(s))
+            print s.sent[s.stack[0]].head
+            print s.stack[0], s.i
+            print gold_heads[s.stack[0]], gold_heads[s.i]
+            print gold_labels[s.i]
+            print children_in_buffer(s, s.stack[0], gold_heads)
+            print head_in_buffer(s, s.stack[0], gold_heads)
+            raise StandardError 
+        return best
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -2,6 +2,8 @@
 # cython: embedsignature=True
 from __future__ import unicode_literals

+from os import path
+
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc

@ -28,6 +30,17 @@ cdef class Tokenizer:
        self.vocab = Vocab(self.get_props)
        self._load_special_tokenization(rules)

+    @classmethod
+    def from_dir(cls, Vocab vocab, object data_dir):
+        """Build a Tokenizer for `vocab` from the language data in `data_dir`.
+
+        The special-case rules and the prefix, suffix and infix expressions
+        are read with util.read_lang_data(data_dir) and passed to the
+        constructor together with `vocab`.
+
+        Raises IOError if `data_dir` does not exist or is not a directory.
+        """
+        if not path.exists(data_dir):
+            raise IOError("Directory %s not found -- cannot load Tokenizer." % data_dir)
+        if not path.isdir(data_dir):
+            raise IOError("Path %s is a file, not a dir -- cannot load Tokenizer." % data_dir)
+        # The two checks above already guarantee an existing directory here.
+        rules, prefix_re, suffix_re, infix_re = util.read_lang_data(data_dir)
+        return cls(vocab, rules, prefix_re, suffix_re, infix_re)
+
    cpdef Tokens tokens_from_list(self, list strings):
        cdef int length = sum([len(s) for s in strings])
        cdef Tokens tokens = Tokens(self.vocab.strings, length)
--- a/spacy/typedefs.pxd
+++ b/spacy/typedefs.pxd
@ -1,6 +1,26 @@
 from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
 from libc.stdint cimport uint8_t

+
+# Google universal tag set
+# Coarse-grained part-of-speech tags. Members are numbered consecutively
+# from 0 in declaration order, so the order below must not change.
+cpdef enum univ_tag_t:
+    NO_TAG  # value 0
+    ADJ  # adjective
+    ADV  # adverb
+    ADP  # adposition
+    CONJ  # conjunction
+    DET  # determiner
+    NOUN  # noun
+    NUM  # numeral
+    PRON  # pronoun
+    PRT  # particle
+    VERB  # verb
+    X  # other / catch-all
+    PUNCT  # punctuation
+    EOL  # NOTE(review): presumably an end-of-line marker -- confirm
+    N_UNIV_TAGS  # equals the number of members above; keep last
+
+
 ctypedef uint64_t hash_t
 ctypedef char* utf8_t
 ctypedef uint32_t attr_t
@ -10,11 +30,3 @@ ctypedef uint16_t len_t
 ctypedef uint16_t tag_t


-cdef struct Morphology:
-    uint8_t number
-    uint8_t tenspect # Tense/aspect/voice
-    uint8_t mood
-    uint8_t gender
-    uint8_t person
-    uint8_t case
-    uint8_t misc
--- a/spacy/utf8string.pxd
+++ b/spacy/utf8string.pxd
@ -1,34 +0,0 @@
-from preshed.maps cimport PreshMap
-from cymem.cymem cimport Pool
-from murmurhash.mrmr cimport hash64
-
-from .typedefs cimport utf8_t, id_t, hash_t
-
-
-cdef struct Utf8Str:
-    id_t i
-    hash_t key
-    utf8_t chars
-    int length
-
-
-cdef struct UniStr:
-    Py_UNICODE* chars
-    size_t n
-    hash_t key
-
-
-cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
-    s.chars = &chars[start]
-    s.n = end - start
-    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
-
-
-cdef class StringStore:
-    cdef Pool mem
-    cdef PreshMap _map
-    cdef Utf8Str* strings
-    cdef int size
-    cdef int _resize_at
-    
-    cdef const Utf8Str* intern(self, char* chars, int length) except NULL
--- a/spacy/utf8string.pyx
+++ b/spacy/utf8string.pyx
@ -1,80 +0,0 @@
-from libc.string cimport memcpy
-
-from murmurhash.mrmr cimport hash64
-import codecs
-
-SEPARATOR = '\n|-SEP-|\n'
-
-
-cdef class StringStore:
-    def __init__(self):
-        self.mem = Pool()
-        self._map = PreshMap()
-        self._resize_at = 10000
-        self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
-        self.size = 1
-
-    property size:
-        def __get__(self):
-            return self.size-1
-
-    def __getitem__(self, object string_or_id):
-        cdef bytes byte_string
-        cdef const Utf8Str* utf8str
-        if isinstance(string_or_id, int) or isinstance(string_or_id, long):
-            if string_or_id < 1 or string_or_id >= self.size:
-                raise IndexError(string_or_id)
-            utf8str = &self.strings[<int>string_or_id]
-            return utf8str.chars[:utf8str.length]
-        elif isinstance(string_or_id, bytes):
-            utf8str = self.intern(<char*>string_or_id, len(string_or_id))
-            return utf8str.i
-        elif isinstance(string_or_id, unicode):
-            byte_string = string_or_id.encode('utf8')
-            utf8str = self.intern(<char*>byte_string, len(byte_string))
-            return utf8str.i
-        else:
-            raise TypeError(type(string_or_id))
-
-    cdef const Utf8Str* intern(self, char* chars, int length) except NULL:
-        # 0 means missing, but we don't bother offsetting the index. We waste
-        # slot 0 to simplify the code, because it doesn't matter.
-        assert length != 0
-        cdef hash_t key = hash64(chars, length * sizeof(char), 0)
-        cdef void* value = self._map.get(key)
-        cdef size_t i
-        if value == NULL:
-            if self.size == self._resize_at:
-                self._resize_at *= 2
-                self.strings = <Utf8Str*>self.mem.realloc(self.strings, self._resize_at * sizeof(Utf8Str))
-            i = self.size
-            self.strings[i].i = self.size
-            self.strings[i].key = key
-            self.strings[i].chars = <char*>self.mem.alloc(length, sizeof(char))
-            memcpy(self.strings[i].chars, chars, length)
-            self.strings[i].length = length
-            self._map.set(key, <void*>self.size)
-            self.size += 1
-        else:
-            i = <size_t>value
-        return &self.strings[i]
-
-    def dump(self, loc):
-        strings = []
-        cdef Utf8Str* string
-        cdef bytes py_string
-        for i in range(self.size):
-            string = &self.strings[i]
-            py_string = string.chars[:string.length]
-            strings.append(py_string.decode('utf8'))
-        with codecs.open(loc, 'w', 'utf8') as file_:
-            file_.write(SEPARATOR.join(strings))
-
-    def load(self, loc):
-        with codecs.open(loc, 'r', 'utf8') as file_:
-            strings = file_.read().split(SEPARATOR)
-        cdef unicode string
-        cdef bytes byte_string
-        for string in strings[1:]:
-            byte_string = string.encode('utf8')
-            self.intern(byte_string, len(byte_string))
--- a/spacy/util.py
+++ b/spacy/util.py
@ -11,8 +11,7 @@ def utf8open(loc, mode='r'):
    return codecs.open(loc, mode, 'utf8')


-def read_lang_data(name):
-    data_dir = path.join(DATA_DIR, name)
+def read_lang_data(data_dir):
    with open(path.join(data_dir, 'specials.json')) as file_:
        tokenization = ujson.load(file_)
    prefix = read_prefix(data_dir)
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -19,6 +19,17 @@ cdef class Vocab:
        self.lexemes.push_back(&EMPTY_LEXEME)
        self.get_lex_props = get_props

+    @classmethod
+    def from_dir(cls, object data_dir, object get_lex_props=None):
+        """Create a Vocab and populate it from the files in `data_dir`.
+
+        The string store is loaded from `data_dir`/strings and the lexemes
+        from `data_dir`/lexemes. `get_lex_props` is passed to the constructor.
+
+        Raises IOError if `data_dir` does not exist or is not a directory.
+        """
+        if not path.exists(data_dir):
+            raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
+        if not path.isdir(data_dir):
+            raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
+        # Forward the caller-supplied callback; the parameter here is named
+        # get_lex_props (get_props is only the constructor's parameter name).
+        cdef Vocab self = cls(get_lex_props)
+        self.strings.load(path.join(data_dir, 'strings'))
+        self.load(path.join(data_dir, 'lexemes'))
+        return self
+
    def __len__(self):
        return self.lexemes.size()