* Tmp

2025-09-18 10:02:40 +03:00 · 2014-12-21 05:36:29 +11:00 · 2014-12-21 05:36:29 +11:00 · e1c1a4b868
commit e1c1a4b868
parent d11c1edf8c
42 changed files with 138 additions and 2382 deletions
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@ -1,135 +0,0 @@
 from thinc.typedefs cimport atom_t
 from .lang cimport Language
 from .tokens cimport Tokens
 from .tokens cimport TokenC
 # Values for the 'person' morphological property. POS_TAGS in en.pyx assigns
 # THIRD to the 'VBZ' tag; NO_PERSON is the first member of the enum.
 cpdef enum en_person_t:
    NO_PERSON
    FIRST
    SECOND
    THIRD
    NON_THIRD
 # Values for the 'number' morphological property. POS_TAGS in en.pyx assigns
 # PLURAL to the 'NNS' and 'NNPS' tags.
 cpdef enum en_number_t:
    NO_NUMBER
    SINGULAR
    PLURAL
    MASS
 # Gender values. NOTE(review): no tag in the POS_TAGS table of en.pyx sets a
 # gender property, so the consumer of this enum is not visible here.
 cpdef enum en_gender_t:
    NO_GENDER
    MASCULINE
    FEMININE
    NEUTER
 # Values for the 'case' morphological property. POS_TAGS in en.pyx assigns
 # GENITIVE to the 'POS', 'PRP$' and 'WP$' tags.
 cpdef enum en_case_t:
    NO_CASE
    NOMINATIVE
    GENITIVE
    ACCUSATIVE
    REFLEXIVE
    DEMONYM
 # Values for the 'tenspect' morphological property. POS_TAGS in en.pyx uses
 # MODAL ('MD'), PAST ('VBD'), ING ('VBG'), PASSIVE ('VBN') and PRESENT
 # ('VBP', 'VBZ').
 cpdef enum en_tenspect_t:
    NO_TENSE
    BASE_VERB
    PRESENT
    PAST
    PASSIVE
    ING
    MODAL
 # Values for the 'misc' morphological property. POS_TAGS in en.pyx uses
 # COMPARATIVE, SUPERLATIVE, RELATIVE and NAME.
 cpdef enum misc_t:
    NO_MISC
    COMPARATIVE
    SUPERLATIVE
    RELATIVE
    NAME
 # Flags
 # Bit positions within the flags_t value built by English.set_flags (en.pyx),
 # which shifts each orthographic test result left by the matching member.
 # Only IS_ALPHA..LIKE_NUMBER are set there; the OFT_* and IN_* members are not
 # written by any code visible in this change.
 cpdef enum FlagID:
    IS_ALPHA
    IS_ASCII
    IS_DIGIT
    IS_LOWER
    IS_PUNCT
    IS_SPACE
    IS_TITLE
    IS_UPPER
    LIKE_URL
    LIKE_NUMBER
    OFT_LOWER
    OFT_TITLE
    OFT_UPPER
    IN_MALES
    IN_FEMALES
    IN_SURNAMES
    IN_PLACES
    IN_GAMES
    IN_CELEBS
    IN_NAMES
 # Indices into the atom_t context array used by the POS tagger in en.pyx.
 # There are five token slots (P2, P1, W, N1, N2) of eight fields each, in the
 # order written by _fill_from_token: sic, cluster, shape, prefix, suffix, pos,
 # lemma, pos_type. fill_pos_context passes the address of each slot's *_sic
 # entry, so the eight members of a slot must stay contiguous and in this order.
 # N_CONTEXT_FIELDS is the total number of fields and sizes the context array.
 cpdef enum:
    P2_sic
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_lemma
    P2_pos_type
    P1_sic
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_lemma
    P1_pos_type
    W_sic
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_lemma
    W_pos_type
    N1_sic
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_lemma
    N1_pos_type
    N2_sic
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_lemma
    N2_pos_type
    N_CONTEXT_FIELDS
 # C-level declaration of the English subclass of Language. Both methods take
 # a pointer to a single token and use -1 as their error return value; their
 # implementations in en.pyx are currently empty (`pass`).
 cdef class English(Language):
    cdef int is_base_np_end(self, const TokenC* token) except -1
    cdef int is_outside_base_np(self, const TokenC* token) except -1
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@ -1,213 +0,0 @@
 # cython: profile=True
 # cython: embedsignature=True
 '''Tokenize English text, using a scheme that differs from the Penn Treebank 3
 scheme in several important respects:
 * Whitespace is added as tokens, except for single spaces. e.g.,
    >>> [w.string for w in EN.tokenize(u'\\nHello  \\tThere')]
    [u'\\n', u'Hello', u' ', u'\\t', u'There']
 * Contractions are normalized, e.g.
    >>> [w.string for w in EN.tokenize(u"isn't ain't won't he's")]
    [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]
 * Hyphenated words are split, with the hyphen preserved, e.g.:
    >>> [w.string for w in EN.tokenize(u'New York-based')]
    [u'New', u'York', u'-', u'based']
 Other improvements:
 * Email addresses, URLs, European-formatted dates and other numeric entities not
  found in the PTB are tokenized correctly
 * Heuristic handling of word-final periods (PTB expects sentence boundary detection
  as a pre-process before tokenization.)
 Take care to ensure your training and run-time data is tokenized according to the
 same scheme. Tokenization problems are a major cause of poor performance for
 NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
 provides a fully Penn Treebank 3-compliant tokenizer.
 '''
 from __future__ import unicode_literals
 from murmurhash.mrmr cimport hash64
 cimport lang
 from .typedefs cimport hash_t, id_t, flags_t
 import orth
 from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
 from .morphology cimport X, PUNCT, EOL
 from .tokens cimport Morphology
 DEF USE_POS_CACHE = True
 # Maps each tag string to a pair: the coarse part-of-speech constant imported
 # from .morphology, and a dict of morphological properties keyed by 'misc',
 # 'tenspect', 'number', 'case' or 'person' (values come from the enums in
 # en.pxd). An empty dict means no extra properties are set for that tag.
 POS_TAGS = {
    'NULL': (NO_TAG, {}),
    'EOL': (EOL, {}),
    'CC': (CONJ, {}),
    'CD': (NUM, {}),
    'DT': (DET, {}),
    'EX': (DET, {}),
    'FW': (X, {}),
    'IN': (ADP, {}),
    'JJ': (ADJ, {}),
    'JJR': (ADJ, {'misc': COMPARATIVE}),
    'JJS': (ADJ, {'misc': SUPERLATIVE}),
    'LS': (X, {}),
    'MD': (VERB, {'tenspect': MODAL}),
    'NN': (NOUN, {}),
    'NNS': (NOUN, {'number': PLURAL}),
    'NNP': (NOUN, {'misc': NAME}),
    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
    'PDT': (DET, {}),
    'POS': (PRT, {'case': GENITIVE}),
    'PRP': (NOUN, {}),
    'PRP$': (NOUN, {'case': GENITIVE}),
    'RB': (ADV, {}),
    'RBR': (ADV, {'misc': COMPARATIVE}),
    'RBS': (ADV, {'misc': SUPERLATIVE}),
    'RP': (PRT, {}),
    'SYM': (X, {}),
    'TO': (PRT, {}),
    'UH': (X, {}),
    'VB': (VERB, {}),
    'VBD': (VERB, {'tenspect': PAST}),
    'VBG': (VERB, {'tenspect': ING}),
    'VBN': (VERB, {'tenspect': PASSIVE}),
    'VBP': (VERB, {'tenspect': PRESENT}),
    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    'WDT': (DET, {'misc': RELATIVE}),
    'WP': (PRON, {'misc': RELATIVE}),
    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
    'WRB': (ADV, {'misc': RELATIVE}),
    '!': (PUNCT, {}),
    '#': (PUNCT, {}),
    '$': (PUNCT, {}),
    "''": (PUNCT, {}),
    "(": (PUNCT, {}),
    ")": (PUNCT, {}),
    "-LRB-": (PUNCT, {}),
    "-RRB-": (PUNCT, {}),
    ".": (PUNCT, {}),
    ",": (PUNCT, {}),
    "``": (PUNCT, {}),
    ":": (PUNCT, {}),
    "?": (PUNCT, {}),
 }
 # Feature templates for the POS tagger. Each template is a tuple of context
 # field indices (the P2_/P1_/W_/N1_/N2_ enum members from en.pxd); a template
 # with several indices combines those fields into one feature.
 POS_TEMPLATES = (
    (W_sic,),
    (P1_lemma, P1_pos),
    (P2_lemma, P2_pos),
    (N1_sic,),
    (N2_sic,),
    (W_suffix,),
    (W_prefix,),
    (P1_pos,),
    (P2_pos,),
    (P1_pos, P2_pos),
    (P1_pos, W_sic),
    (P1_suffix,),
    (N1_suffix,),
    (W_shape,),
    (W_cluster,),
    (N1_cluster,),
    (N2_cluster,),
    (P1_cluster,),
    (P2_cluster,),
    (W_pos_type,),
    (N1_pos_type,),
    # NOTE(review): exact duplicate of the previous template; possibly meant to
    # be another slot's pos_type (e.g. P1_pos_type) -- confirm before changing.
    (N1_pos_type,),
    (P1_pos, W_pos_type, N1_pos_type),
 )
 cdef class English(Language):
    """English tokenizer, tightly coupled to lexicon.
    Attributes:
        name (unicode): The two letter code used by Wikipedia for the language.
        lexicon (Lexicon): The lexicon. Exposes the lookup method.
    """
    def get_props(self, unicode string):
        # Lexical properties for `string`: its flag bits under 'flags' and its
        # word shape (orth.word_shape) under 'dense'.
        return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)}
    def set_flags(self, unicode string):
        # Build a bit field: each orth.* test result is shifted to the bit
        # position named by the matching FlagID member and OR-ed in.
        cdef flags_t flags = 0
        flags |= orth.is_alpha(string) << IS_ALPHA
        flags |= orth.is_ascii(string) << IS_ASCII
        flags |= orth.is_digit(string) << IS_DIGIT
        flags |= orth.is_lower(string) << IS_LOWER
        flags |= orth.is_punct(string) << IS_PUNCT
        flags |= orth.is_space(string) << IS_SPACE
        flags |= orth.is_title(string) << IS_TITLE
        flags |= orth.is_upper(string) << IS_UPPER
        flags |= orth.like_url(string) << LIKE_URL
        flags |= orth.like_number(string) << LIKE_NUMBER
        return flags
    def set_pos(self, Tokens tokens):
        # Tag every token in place: fill the context window, predict the tag,
        # then let the morphologizer set the morphology for that position.
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        cdef TokenC* t = tokens.data
        # NOTE(review): bigram, cache_key, cached and tagdict are declared but
        # never used in this method.
        cdef id_t[2] bigram
        cdef hash_t cache_key
        cdef void* cached = NULL
        assert self.morphologizer is not None
        cdef dict tagdict = self.pos_tagger.tagdict
        for i in range(tokens.length):
            fill_pos_context(context, i, t)
            t[i].pos = self.pos_tagger.predict(context)
            self.morphologizer.set_morph(i, t)
    def train_pos(self, Tokens tokens, golds):
        # Same loop as set_pos, but the gold tag for each position is passed
        # to predict(). Returns the number of tokens whose predicted tag
        # equals the gold tag.
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        c = 0
        cdef TokenC* t = tokens.data
        for i in range(tokens.length):
            fill_pos_context(context, i, t)
            t[i].pos = self.pos_tagger.predict(context, [golds[i]])
            self.morphologizer.set_morph(i, t)
            c += t[i].pos == golds[i]
        return c
    cdef int is_base_np_end(self, const TokenC* token) except -1:
        # Not implemented: the body is empty.
        pass
    cdef int is_outside_base_np(self, const TokenC* token) except -1:
        # Not implemented: the body is empty.
        pass
 # Fill the five token slots of `context` (P2, P1, W, N1, N2) from the tokens
 # at positions i-2 .. i+2.
 # NOTE(review): no bounds checks are made, so this presumably relies on the
 # token array being padded on both sides -- confirm against Tokens.
 cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1:
    _fill_from_token(&context[P2_sic], &tokens[i-2])
    _fill_from_token(&context[P1_sic], &tokens[i-1])
    _fill_from_token(&context[W_sic], &tokens[i])
    _fill_from_token(&context[N1_sic], &tokens[i+1])
    _fill_from_token(&context[N2_sic], &tokens[i+2])
 # Write the eight fields of one token slot, starting at `context`. The offsets
 # 0..7 must match the per-slot member order of the context enum in en.pxd
 # (sic, cluster, shape, prefix, suffix, pos, lemma, pos_type).
 cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    context[0] = t.lex.sic
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    context[5] = t.pos
    context[6] = t.lemma
    context[7] = t.lex.pos_type
 # Module-level English instance, created at import time; the module docstring
 # examples call EN.tokenize.
 EN = English('en')
--- a/spacy/index.pxd
+++ b/spacy/index.pxd
@ -1,44 +0,0 @@
 from libcpp.vector cimport vector
 from libcpp.pair cimport pair
 from preshed.counter cimport count_t
 from preshed.maps cimport PreshMap
 from preshed.counter cimport PreshCounter
 from cymem.cymem cimport Pool
 from .lang cimport Lexicon
 from .tokens cimport Tokens, TokenC
 from .typedefs cimport id_t
 from .lexeme cimport attr_id_t
 from .typedefs cimport attr_t
 from .typedefs cimport hash_t
 from murmurhash.mrmr cimport hash64
 # One document's counts: a vector of (id, count) pairs, as appended by
 # Index.count in index.pyx.
 ctypedef vector[pair[id_t, count_t]] count_vector_t
 # Term-document index over one lexeme attribute. `counts` holds one
 # count_vector_t per call to count(); `max_value` is the largest attribute
 # value seen so far.
 cdef class Index:
    cdef attr_id_t attr_id
    cdef readonly attr_t max_value
    cdef vector[count_vector_t] counts
    cpdef int count(self, Tokens tokens) except -1
 # Declaration of DecisionMemory (implemented in index.pyx): per-context and
 # per-(context, class) counters plus a `memos` map of memoised decisions.
 cdef class DecisionMemory:
    cdef int n_classes
    cdef Pool mem
    cdef PreshCounter _counts
    cdef PreshCounter _class_counts
    cdef PreshMap memos
    cdef list class_names
    cdef int inc(self, hash_t context_key, hash_t clas, count_t inc) except -1
    cdef int find_best_class(self, count_t* counts, hash_t context_key) except -1
    cdef inline int get(self, hash_t context_key) nogil:
        # load() stores class ids as clas+1, so subtracting 1 recovers the id.
        # Presumably a missing key comes back as NULL and so yields -1 --
        # confirm against PreshMap.get.
        return <int><size_t>self.memos.get(context_key) - 1
--- a/spacy/index.pyx
+++ b/spacy/index.pyx
@ -1,120 +0,0 @@
 """Create a term-document matrix"""
 cimport cython
 from libc.stdint cimport int64_t
 from libc.string cimport memmove
 from cymem.cymem cimport Address
 from .lexeme cimport Lexeme, get_attr
 from .tokens cimport TokenC
 from .typedefs cimport hash_t
 from preshed.maps cimport MapStruct, Cell, map_get, map_set, map_init
 from murmurhash.mrmr cimport hash64
 cdef class Index:
    # Accumulates per-document counts of one lexeme attribute.
    def __init__(self, attr_id_t attr_id):
        # attr_id selects which lexeme attribute get_attr reads in count().
        self.attr_id = attr_id
        self.max_value = 0
    cpdef int count(self, Tokens tokens) except -1:
        # Count the attribute values of all tokens in one document, track the
        # largest value seen, and append the document's (value, count) pairs
        # to self.counts. No value is returned explicitly.
        cdef PreshCounter counts = PreshCounter(2 ** 8)
        cdef attr_id_t attr_id = self.attr_id
        cdef attr_t term
        cdef int i
        for i in range(tokens.length):
            term = get_attr(tokens.data[i].lex, attr_id)
            counts.inc(term, 1)
            if term > self.max_value:
                self.max_value = term
        cdef count_t count
        cdef count_vector_t doc_counts
        for term, count in counts:
            doc_counts.push_back(pair[id_t, count_t](term, count))
        self.counts.push_back(doc_counts)
 cdef class DecisionMemory:
    """Count (context, class) co-occurrences and memoise confident decisions.

    `_counts` is keyed by the context hash, `_class_counts` by the hash of the
    (context hash, class id) pair, and `memos` maps a context hash to the
    memoised class id stored as clas+1.
    """
    def __init__(self, class_names):
        self.class_names = class_names
        self.n_classes = len(class_names)
        self.mem = Pool()
        self._counts = PreshCounter()
        self._class_counts = PreshCounter()
        self.memos = PreshMap()
    def load(self, loc, thresh=50):
        """Load memoised decisions from the file at `loc`.

        Each line holds three whitespace-separated integers: freq, key, clas.
        An entry is kept when `thresh` is 0 or freq >= thresh.
        """
        cdef:
            count_t freq
            hash_t key
            int clas
        # Context manager so the file is closed even if a line fails to parse.
        with open(loc) as file_:
            for line in file_:
                freq, key, clas = [int(p) for p in line.split()]
                if thresh == 0 or freq >= thresh:
                    # Stored as clas+1; the inline get() in index.pxd
                    # subtracts 1 again.
                    self.memos.set(key, <void*>(clas+1))
    def __getitem__(self, ids):
        """Return {class name: count} for the context given by the pair `ids`."""
        cdef id_t[2] context
        # Fixed: the original assigned context[k] to itself, hashing
        # uninitialised memory instead of the requested ids.
        context[0] = ids[0]
        context[1] = ids[1]
        cdef hash_t context_key = hash64(context, 2 * sizeof(id_t), 0)
        cdef hash_t[2] class_context
        class_context[0] = context_key
        counts = {}
        cdef id_t i
        # Fixed: the attribute is `class_names` (was misspelt `clas_names`).
        for i, clas in enumerate(self.class_names):
            class_context[1] = <hash_t>i
            key = hash64(class_context, sizeof(hash_t) * 2, 0)
            count = self._class_counts[key]
            counts[clas] = count
        return counts
    @cython.cdivision(True)
    def iter_contexts(self, float min_acc=0.99, count_t min_freq=10):
        """Yield (best class count, context key, best class) per kept context.

        A context is kept when it was seen at least `min_freq` times and its
        most frequent class accounts for at least `min_acc` of its total.
        """
        cdef Address counts_addr = Address(self.n_classes, sizeof(count_t))
        cdef count_t* counts = <count_t*>counts_addr.ptr
        cdef MapStruct* context_counts = self._counts.c_map
        cdef hash_t context_key
        cdef count_t context_freq
        cdef int best_class
        cdef float acc
        cdef int i
        for i in range(context_counts.length):
            context_key = context_counts.cells[i].key
            context_freq = <count_t>context_counts.cells[i].value
            # A key of 0 is skipped (presumably marks an empty cell).
            if context_key != 0 and context_freq >= min_freq:
                best_class = self.find_best_class(counts, context_key)
                # Fixed: cast before dividing so the ratio is not truncated
                # by integer division (assumes count_t is an integer type).
                acc = <float>counts[best_class] / <float>context_freq
                if acc >= min_acc:
                    yield counts[best_class], context_key, best_class
    cdef int inc(self, hash_t context_key, hash_t clas, count_t inc) except -1:
        # Add `inc` to both the context counter and the (context, class)
        # counter.
        cdef hash_t context_and_class_key
        cdef hash_t[2] context_and_class
        context_and_class[0] = context_key
        context_and_class[1] = clas
        context_and_class_key = hash64(context_and_class, 2 * sizeof(hash_t), 0)
        self._counts.inc(context_key, inc)
        self._class_counts.inc(context_and_class_key, inc)
    cdef int find_best_class(self, count_t* counts, hash_t context_key) except -1:
        # Write the per-class counts for `context_key` into `counts` and
        # return the index of the most frequent class (ties go to the later
        # class).
        cdef hash_t[2] unhashed_key
        unhashed_key[0] = context_key
        cdef hash_t key
        cdef int clas
        # Fixed: initialised so the result is defined when n_classes is 0.
        cdef int best = 0
        cdef int mode = 0
        for clas in range(self.n_classes):
            unhashed_key[1] = <hash_t>clas
            key = hash64(unhashed_key, sizeof(hash_t) * 2, 0)
            count = self._class_counts[key]
            counts[clas] = count
            if count >= mode:
                mode = count
                best = clas
        return best
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@ -1,90 +0,0 @@
 from os import path
 # Suffix-rewrite rules used by Lemmatizer. Each rule is (old_suffix,
 # new_suffix); lemmatize() tries every rule whose old suffix matches and keeps
 # the rewritten form only if it is found in the index, so overlapping rules
 # (e.g. 'es' -> 'e' and 'es' -> '') are deliberate alternatives.
 NOUN_RULES = (
    ('s', ''),
    ('ses', 's'),
    ('ves', 'f'),
    ('xes', 'x'),
    ('zes', 'z'),
    ('ches', 'ch'),
    ('shes', 'sh'),
    ('men', 'man'),
    ('ies', 'y')
 )
 VERB_RULES = (
    ("s", ""),
    ("ies", "y"),
    ("es", "e"),
    ("es", ""),
    ("ed", "e"),
    ("ed", ""),
    ("ing", "e"),
    ("ing", "")
 )
 ADJ_RULES = (
    ("er", ""),
    ("est", ""),
    ("er", "e"),
    ("est", "e")
 )
 class Lemmatizer(object):
    """Lemmatizer driven by per-category index and exception files.

    For each of 'adj', 'adv', 'noun' and 'verb', the file `index.<pos>` is
    loaded with read_index and `<pos>.exc` with read_exc, both taken from
    `wn_dict_dir`. The results are stored in `self.index` and `self.exc`,
    keyed by category name.
    """
    def __init__(self, wn_dict_dir):
        self.index = {}
        self.exc = {}
        for category in ('adj', 'adv', 'noun', 'verb'):
            index_loc = path.join(wn_dict_dir, 'index.%s' % category)
            self.index[category] = read_index(index_loc)
            exc_loc = path.join(wn_dict_dir, '%s.exc' % category)
            self.exc[category] = read_exc(exc_loc)
    def _apply(self, category, string, rules):
        # Shared body of noun/verb/adj: look up the category's tables and
        # delegate to lemmatize().
        known = self.index[category]
        irregular = self.exc[category]
        return lemmatize(string, known, irregular, rules)
    def noun(self, string):
        """Return the set of candidate noun lemmas for `string`."""
        return self._apply('noun', string, NOUN_RULES)
    def verb(self, string):
        """Return the set of candidate verb lemmas for `string`."""
        return self._apply('verb', string, VERB_RULES)
    def adj(self, string):
        """Return the set of candidate adjective lemmas for `string`."""
        return self._apply('adj', string, ADJ_RULES)
 def lemmatize(string, index, exceptions, rules):
    """Return the set of candidate lemmas for `string`.

    The lower-cased word is a candidate if it is in `index`; any forms listed
    for it in `exceptions` are candidates; and every (old, new) suffix rule
    that matches contributes its rewritten form when that form is in `index`.
    If nothing qualifies, the lower-cased word itself is returned.
    """
    word = string.lower()
    candidates = set()
    if word in index:
        candidates.add(word)
    candidates.update(exceptions.get(word, []))
    for suffix, replacement in rules:
        if not word.endswith(suffix):
            continue
        # Slice by length so an empty suffix keeps the whole word.
        stem = word[:len(word) - len(suffix)]
        candidate = stem + replacement
        if candidate in index:
            candidates.add(candidate)
    if not candidates:
        candidates.add(word)
    return candidates
 def read_index(loc):
    """Read the set of single-word entries from the index file at `loc`.

    Lines that start with a space are skipped, as are blank lines. The first
    whitespace-separated field of each remaining line is the word; words
    containing an underscore are left out.
    """
    index = set()
    # Context manager so the file handle is always closed.
    with open(loc) as file_:
        for line in file_:
            if line.startswith(' '):
                continue
            pieces = line.split()
            if not pieces:
                # Blank line: previously raised IndexError on pieces[0].
                continue
            word = pieces[0]
            if '_' not in word:
                index.add(word)
    return index
 def read_exc(loc):
    """Read the exception table at `loc`.

    Lines that start with a space are skipped, as are blank lines. On each
    remaining line the first whitespace-separated field is the key and the
    rest become a tuple of forms. Returns a dict mapping key to that tuple.
    """
    exceptions = {}
    # Context manager so the file handle is always closed.
    with open(loc) as file_:
        for line in file_:
            if line.startswith(' '):
                continue
            pieces = line.split()
            if not pieces:
                # Blank line: previously raised IndexError on pieces[0].
                continue
            exceptions[pieces[0]] = tuple(pieces[1:])
    return exceptions
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -36,11 +36,11 @@ cdef struct _Cached:
 cdef class Morphologizer:
    """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
    """
-    def __init__(self, StringStore strings, object lemmatizer, **kwargs):
+    def __init__(self, StringStore strings, object lemmatizer,
                 irregulars=None, tag_map=None, tag_names=None):
        self.mem = Pool()
        self.strings = strings
-        tag_map = kwargs['tag_map']
+        self.tag_names = tag_names
        self.tag_names = kwargs['tag_names']
        self.lemmatizer = lemmatizer
        self._cache = PreshMapArray(len(self.tag_names))
        self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
@ -55,9 +55,16 @@ cdef class Morphologizer:
            self.tags[i].morph.person = props.get('person', 0)
            self.tags[i].morph.case = props.get('case', 0)
            self.tags[i].morph.misc = props.get('misc', 0)
-        #if path.exists(path.join(data_dir, 'morphs.json')):
+        if irregulars is not None:
-        #    with open(path.join(data_dir, 'morphs.json')) as file_:
+            self.load_exceptions(irregulars)
-        #        self.load_exceptions(json.load(file_))
+
    @classmethod
    def from_dir(cls, StringStore strings, object lemmatizer, data_dir):
        tag_map = None
        irregulars = None
        tag_names = None
        return cls(strings, lemmatizer, tag_map=tag_map, irregulars=irregulars,
                   tag_names=tag_names)
    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
        if self.lemmatizer is None:
@ -86,7 +93,6 @@ cdef class Morphologizer:
            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
            cached.morph = tag.morph
            self._cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
        tokens[i].lemma = cached.lemma
        tokens[i].morph = cached.morph
--- a/spacy/ner/init.pxd
+++ b/spacy/ner/init.pxd
--- a/spacy/ner/init.py
+++ b/spacy/ner/init.py
--- a/spacy/ner/_feats.pxd
+++ b/spacy/ner/_feats.pxd
--- a/spacy/ner/_feats.pyx
+++ b/spacy/ner/_feats.pyx
@ -1,169 +0,0 @@
 from spacy.context cimport FIELD_IDS, Token
 # Module-level aliases for the nine token slots of FIELD_IDS (P4..P1, N0,
 # N1..N4). The templates below refer to fields of these slots, e.g. N0.sic.
 cdef Token P4 = FIELD_IDS.P4
 cdef Token P3 = FIELD_IDS.P3
 cdef Token P2 = FIELD_IDS.P2
 cdef Token P1 = FIELD_IDS.P1
 cdef Token N0 = FIELD_IDS.N0
 cdef Token N1 = FIELD_IDS.N1
 cdef Token N2 = FIELD_IDS.N2
 cdef Token N3 = FIELD_IDS.N3
 cdef Token N4 = FIELD_IDS.N4
 # The triple-quoted string below is an earlier TEMPLATES definition kept as
 # an inert string literal; it is not assigned to any name. The active
 # TEMPLATES is built from the named groups at the end of this module.
 """
 TEMPLATES = (
    (N0.sic,),
    (N0.cluster,),
    (P1.pos,),
    (P1.sic,),
    (N1.norm,),
    (N1.pos,),
    (P1.ner,),
    (P2.ner,),
    (N0.cluster,),
    (P1.cluster,),
    (N1.cluster,),
    (N0.is_alpha,),
    (N0.is_digit,),
    (N0.is_title,),
    (N0.is_upper,),
    (N0.is_title, N0.oft_title),
    (N0.is_upper, N0.oft_upper),
    (P1.cluster, N0.norm),
    (N0.norm, N1.cluster),
    (P1.ner, N0.pos),
    (P2.ner, P1.ner, N0.pos),
    (P2.pos, P1.pos, N0.sic),
    (N0.sic, N1.pos, N2.pos)
 )
 """
 # Feature-template groups. Each template is a tuple of token fields; the
 # groups are concatenated into TEMPLATES at the end of the module.
 # LOCAL: word form, prefix/suffix, shape and norm features around N0.
 LOCAL = (
    (N0.sic,),
    (P1.sic,),
    (N1.sic,),
    (P2.sic,),
    (N2.sic,),
    (P3.sic,),
    (N3.sic,),
    (P4.sic,),
    (N4.sic,),
    (P1.sic, N0.sic,),
    (N0.sic, N1.sic),
    (N0.prefix,),
    (N0.suffix,),
    (P1.shape,),
    (N0.shape,),
    (N1.shape,),
    (P1.shape, N0.shape,),
    # NOTE(review): same two fields as the previous template in reverse
    # order -- possibly redundant, or meant to be (N0.shape, N1.shape).
    (N0.shape, P1.shape,),
    (P1.shape, N0.shape, N1.shape),
    (N2.shape,),
    (P2.shape,),
    (P3.shape,),
    (N3.shape,),
    (P4.shape,),
    (N4.shape,),
    (P2.norm, P1.norm, N0.norm),
    (P1.norm, N0.norm, N1.norm),
    (N0.norm, N1.norm, N2.norm)
 )
 # BOOLS: boolean flag features of the current token.
 BOOLS = (
    (N0.is_title,),
 )
 # HISTORY: features over the `ner` field of preceding tokens.
 HISTORY = (
    (P1.ner,),
    (P1.ner, N0.sic,),
    (P2.ner,),
    (P2.ner, P1.ner),
    (P2.ner, P1.ner, N0.sic),
    (P2.pos, P1.ner, N0.pos),
    (P2.ner, P1.pos, N0.pos),
    (P3.ner,),
    (P4.ner,),
 )
 # POS: part-of-speech features over the nine-token window.
 POS = (
    (P4.pos,),
    (P3.pos,),
    (P2.pos,),
    (P1.pos,),
    (N0.pos,),
    (N1.pos,),
    (N2.pos,),
    (N3.pos,),
    (N4.pos,),
    (P1.pos, N0.pos),
    (N0.pos, N1.pos),
    (P2.pos, P1.pos, N0.pos),
    (P1.pos, N0.pos, N1.pos),
    (N0.pos, N1.pos, N2.pos)
 )
 # CLUSTERS: `cluster` field features over the nine-token window.
 CLUSTERS = (
    (P4.cluster,),
    (P3.cluster,),
    (P2.cluster,),
    (P1.cluster,),
    (N0.cluster,),
    (N1.cluster,),
    (N2.cluster,),
    (N3.cluster,),
    (N4.cluster,),
    (P1.cluster, N0.cluster),
    (N0.cluster, N1.cluster),
 )
 # CLUSTER_POS: cluster/POS combinations of adjacent tokens.
 CLUSTER_POS = (
    (P1.cluster, N0.pos),
    # NOTE(review): same two fields as the previous template in reverse
    # order -- possibly meant to be (P1.pos, N0.cluster).
    (N0.pos, P1.cluster),
    (N0.cluster, N1.pos),
    (N0.pos, N1.cluster)
 )
 # GAZ: the seven in_* membership fields for the current, previous and next
 # token.
 GAZ = (
    (N0.in_males,),
    (N0.in_females,),
    (N0.in_surnames,),
    (N0.in_places,),
    (N0.in_games,),
    (N0.in_celebs,),
    (N0.in_names,),
    (P1.in_males,),
    (P1.in_females,),
    (P1.in_surnames,),
    (P1.in_places,),
    (P1.in_games,),
    (P1.in_celebs,),
    (P1.in_names,),
    (N1.in_males,),
    (N1.in_females,),
    (N1.in_surnames,),
    (N1.in_places,),
    (N1.in_games,),
    (N1.in_celebs,),
    (N1.in_names,),
 )
 # The full template set: every group concatenated into one tuple.
 TEMPLATES = LOCAL + HISTORY + CLUSTERS + POS + CLUSTER_POS + GAZ + BOOLS
--- a/spacy/ner/_state.pxd
+++ b/spacy/ner/_state.pxd
@ -1,15 +0,0 @@
 from cymem.cymem cimport Pool
 from .structs cimport State, Entity, Move
 # C-level declarations for the NER parse-state helpers implemented in
 # _state.pyx. Functions returning int or bint use -1 as their error value;
 # init_state uses NULL.
 cdef int begin_entity(State* s, label) except -1
 cdef int end_entity(State* s) except -1
 cdef State* init_state(Pool mem, int sent_length) except NULL
 cdef int copy_state(Pool mem, State* dest, State* source) except -1
 cdef bint entity_is_open(State *s) except -1
 cdef int entity_is_sunk(State *s, Move* golds) except -1
 cdef int is_done(State* s) except -1
--- a/spacy/ner/_state.pyx
+++ b/spacy/ner/_state.pyx
@ -1,54 +0,0 @@
 from libc.string cimport memcpy
 # Open a new entity at the current token: advance the entity index s.j, then
 # record the start (s.i), the label, and a provisional end of s.i + 1.
 # Because s.j is incremented first, slot 0 of s.ents is never written here.
 cdef int begin_entity(State* s, label) except -1:
    s.j += 1
    s.ents[s.j].start = s.i
    s.ents[s.j].tag = label
    s.ents[s.j].end = s.i + 1
 # Set the end of the current entity (slot s.j) to just past the current token.
 cdef int end_entity(State* s) except -1:
    s.ents[s.j].end = s.i + 1
 # Allocate a State from `mem`, with entity and tag arrays of `sent_length`
 # slots each, and return it.
 cdef State* init_state(Pool mem, int sent_length) except NULL:
    cdef State* s = <State*>mem.alloc(1, sizeof(State))
    s.ents = <Entity*>mem.alloc(sent_length, sizeof(Entity))
    s.tags = <int*>mem.alloc(sent_length, sizeof(int))
    s.length = sent_length
    # Fixed: the state was never returned, so the function fell off the end
    # and yielded NULL, which is also its declared error value.
    return s
 # Report whether the current entity slot counts as open, judged solely by its
 # start index being non-zero.
 # NOTE(review): an entity that starts at token 0 has start == 0 and is
 # therefore reported as not open; end_entity also never resets `start`.
 cdef bint entity_is_open(State *s) except -1:
    return s.ents[s.j].start != 0
 # Returns False when no entity is open. Otherwise it raises StandardError
 # unconditionally: the comparison against the gold moves is commented out
 # below, so the open-entity case is not implemented.
 cdef int entity_is_sunk(State *s, Move* golds) except -1:
    if not entity_is_open(s):
        return False
    raise StandardError
    #cdef Entity* ent = &s.ents[s.j]
    #cdef Move* gold = &golds[ent.start]
    #if gold.action != BEGIN and gold.action != UNIT:
    #    return True
    #elif gold.label != ent.label:
    #    return True
    #else:
    #    return False
 cdef int copy_state(Pool mem, State* dest, State* source) except -1:
    '''Copy state source into state dest.'''
    # Grow dest's arrays first if source holds more tokens than dest.
    if source.length > dest.length:
        dest.ents = <Entity*>mem.realloc(dest.ents, source.length * sizeof(Entity))
        dest.tags = <int*>mem.realloc(dest.tags, source.length * sizeof(int))
    # Copy source.length elements of each array, then the scalar fields.
    memcpy(dest.ents, source.ents, source.length * sizeof(Entity))
    memcpy(dest.tags, source.tags, source.length * sizeof(int))
    dest.length = source.length
    dest.i = source.i
    dest.j = source.j
    dest.curr = source.curr
 # The parse is finished once the token index has reached the sentence length
 # and no entity is still open.
 cdef int is_done(State* s) except -1:
    return s.i >= s.length and not entity_is_open(s)
--- a/spacy/ner/annot.pxd
+++ b/spacy/ner/annot.pxd
@ -1,8 +0,0 @@
 from cymem.cymem cimport Pool
 # Declaration of NERAnnotation (implemented in annot.pyx). starts, ends and
 # labels are per-token int arrays allocated from `mem`; `entities` is the
 # list passed to the constructor.
 cdef class NERAnnotation:
    cdef Pool mem
    cdef int* starts
    cdef int* ends
    cdef int* labels
    cdef readonly list entities
--- a/spacy/ner/annot.pyx
+++ b/spacy/ner/annot.pyx
@ -1,94 +0,0 @@
 from libc.string cimport memset
 cdef class NERAnnotation:
    """Gold-standard entity annotation for one sentence.

    For every token covered by an entity, starts[i], ends[i] and labels[i]
    hold that entity's start index, end index (exclusive) and label id. All
    other positions hold -1.
    """
    def __init__(self, entities, length, entity_types):
        # entities: iterable of (start, end, label) triples with integer
        # labels; length: number of tokens. entity_types is accepted but not
        # used here.
        self.mem = Pool()
        self.starts = <int*>self.mem.alloc(length, sizeof(int))
        self.ends = <int*>self.mem.alloc(length, sizeof(int))
        self.labels = <int*>self.mem.alloc(length, sizeof(int))
        self.entities = entities
        # Setting every byte to 0xFF makes each int read as -1.
        memset(self.starts, -1, sizeof(int) * length)
        memset(self.ends, -1, sizeof(int) * length)
        memset(self.labels, -1, sizeof(int) * length)
        cdef int start, end, label
        for start, end, label in entities:
            for i in range(start, end):
                self.starts[i] = start
                self.ends[i] = end
                self.labels[i] = label
    @classmethod
    def from_bilous(cls, tag_strs, entity_types):
        """Build an annotation from BILOU tag strings such as 'B-PER'.

        'O' and '-' tags are skipped. A label's id is its index in
        `entity_types`; unseen labels are appended to that list in place.
        """
        entities = []
        start = None
        for i, tag_str in enumerate(tag_strs):
            if tag_str == 'O' or tag_str == '-':
                continue
            # Split on the first hyphen only, so labels may contain hyphens.
            move, label_str = tag_str.split('-', 1)
            # Fixed: list.index raises ValueError for a missing item (it never
            # returns -1), and the label string -- not its integer id -- is
            # what must be appended so later lookups can find it.
            try:
                label = entity_types.index(label_str)
            except ValueError:
                label = len(entity_types)
                entity_types.append(label_str)
            if move == 'U':
                assert start is None
                entities.append((i, i+1, label))
            elif move == 'B':
                assert start is None
                start = i
            elif move == 'L':
                assert start is not None
                entities.append((start, i+1, label))
                start = None
        return cls(entities, len(tag_strs), entity_types)
 def read_iob(file_, entity_types, create_tokens):
    """Read sentences from a four-column IOB file.

    Sentences are separated by blank lines and each token line must split into
    exactly four fields; only the first (word) and last (NER tag) are used.
    Blocks starting with '-DOCSTART-' are skipped. Returns a list of
    (tokens, NERAnnotation) pairs, where tokens is create_tokens(words).
    """
    annotated = []
    for block in file_.read().strip().split('\n\n'):
        if block.startswith('-DOCSTART-'):
            continue
        words, ner_tags = [], []
        for row in block.split('\n'):
            word, _pos, _chunk, ner = row.split()
            words.append(word)
            ner_tags.append(ner)
        bilou = iob_to_bilou(ner_tags)
        tokens = create_tokens(words)
        annotation = NERAnnotation.from_bilous(bilou, entity_types)
        annotated.append((tokens, annotation))
    return annotated
 def iob_to_bilou(tags):
    """Convert a sequence of IOB tags into a list of BILOU tags.

    Works on a copy of `tags`, alternately consuming a run of 'O' tags and
    then one entity until nothing is left.
    """
    remaining = list(tags)
    converted = []
    while remaining:
        converted.extend(_consume_os(remaining))
        converted.extend(_consume_ent(remaining))
    return converted
 def _consume_os(tags):
    while tags and tags[0] == 'O':
        yield tags.pop(0)
 def _consume_ent(tags):
    if not tags:
        return []
    target = tags.pop(0).replace('B', 'I')
    length = 1
    while tags and tags[0] == target:
        length += 1
        tags.pop(0)
    label = target[2:]
    if length == 1:
        return ['U-' + label]
    else:
        start = 'B-' + label
        end = 'L-' + label
        middle = ['I-%s' % label for _ in range(1, length - 1)]
        return [start] + middle + [end]
--- a/spacy/ner/bilou_moves.pxd
+++ b/spacy/ner/bilou_moves.pxd
@ -1,27 +0,0 @@
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport class_t
 from thinc.typedefs cimport weight_t
 from .structs cimport State, Move
 # Transition actions of the BILOU tagging scheme. ACTION_NAMES in
 # bilou_moves.pyx maps them to '?', 'B', 'I', 'L', 'U' and 'O' in this order;
 # N_ACTIONS is the number of actions and sizes that list.
 cpdef enum ActionType:
    MISSING
    BEGIN
    IN
    LAST
    UNIT
    OUT
    N_ACTIONS
 # C-level declarations for the functions implemented in bilou_moves.pyx.
 # The two set_accept_* functions return the number of accepted moves and use
 # 0 as their error value; best_accepted uses NULL and the others -1.
 cdef int set_accept_if_oracle(Move* moves, Move* golds, int n, State* s) except 0
 cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0
 cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL
 cdef int transition(State *s, Move* m) except -1
 cdef int fill_moves(Move* moves, list tag_names) except -1
--- a/spacy/ner/bilou_moves.pyx
+++ b/spacy/ner/bilou_moves.pyx
@ -1,207 +0,0 @@
 from __future__ import unicode_literals
 from ._state cimport begin_entity
 from ._state cimport end_entity
 from ._state cimport entity_is_open
 from ._state cimport entity_is_sunk
 # One-character name for each ActionType member, indexed by the enum value.
 # fill_moves looks action strings up in this list with .index().
 ACTION_NAMES = ['' for _ in range(N_ACTIONS)]
 ACTION_NAMES[<int>MISSING] = '?'
 ACTION_NAMES[<int>BEGIN] = 'B'
 ACTION_NAMES[<int>IN] = 'I'
 ACTION_NAMES[<int>LAST] = 'L'
 ACTION_NAMES[<int>UNIT] = 'U'
 ACTION_NAMES[<int>OUT] = 'O'
 # BEGIN is valid only when no entity is open; `label` is not consulted.
 cdef bint can_begin(State* s, int label):
    return not entity_is_open(s)
 # IN is valid only inside an open entity whose current label equals `label`.
 cdef bint can_in(State* s, int label):
    return entity_is_open(s) and s.curr.label == label
 # LAST is valid only inside an open entity whose current label equals `label`.
 cdef bint can_last(State* s, int label):
    return entity_is_open(s) and s.curr.label == label
 # UNIT is valid only when no entity is open; `label` is not consulted.
 cdef bint can_unit(State* s, int label):
    return not entity_is_open(s)
 # OUT is valid only when no entity is open; `label` is not consulted.
 cdef bint can_out(State* s, int label):
    return not entity_is_open(s)
 # Decide whether taking action `act` with label `tag` is acceptable, given:
 #   g_act, g_tag -- the gold action and label for the current token
 #   next_act     -- the gold action of the following token
 #   is_sunk      -- whether the currently open entity is already wrong
 # A MISSING gold action accepts every move. Combinations not handled by a
 # branch below (for example act == MISSING) fall off the end without an
 # explicit return value.
 cdef bint is_oracle(ActionType act, int tag, ActionType g_act, int g_tag,
                    ActionType next_act, bint is_sunk):
    if g_act == MISSING:
        return True
    if act == BEGIN:
        if g_act == BEGIN:
            # B, Gold B --> Label match
            return tag == g_tag
        else:
            # B, Gold I --> False (P)
            # B, Gold L --> False (P)
            # B, Gold O --> False (P)
            # B, Gold U --> False (P)
            return False
    elif act == IN:
        if g_act == BEGIN:
            # I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
            return True
        elif g_act == IN:
            # I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
            return True
        elif g_act == LAST:
            # I, Gold L --> True iff this entity sunk and next tag == O
            return is_sunk and (next_act == OUT or next_act == MISSING)
        elif g_act == OUT:
            # I, Gold O --> True iff next tag == O
            return next_act == OUT or next_act == MISSING
        elif g_act == UNIT:
            # I, Gold U --> True iff next tag == O
            # NOTE(review): unlike the two branches above, MISSING is not
            # accepted as the next action here -- confirm this is intended.
            return next_act == OUT
    elif act == LAST:
        if g_act == BEGIN:
            # L, Gold B --> True
            return True
        elif g_act == IN:
            # L, Gold I --> True iff this entity sunk
            return is_sunk
        elif g_act == LAST:
            # L, Gold L --> True
            return True
        elif g_act == OUT:
            # L, Gold O --> True
            return True
        elif g_act == UNIT:
            # L, Gold U --> True
            return True
    elif act == OUT:
        if g_act == BEGIN:
            # O, Gold B --> False
            return False
        elif g_act == IN:
            # O, Gold I --> True
            return True
        elif g_act == LAST:
            # O, Gold L --> True
            return True
        elif g_act == OUT:
            # O, Gold O --> True
            return True
        elif g_act == UNIT:
            # O, Gold U --> False
            return False
    elif act == UNIT:
        if g_act == UNIT:
            # U, Gold U --> True iff tag match
            return tag == g_tag
        else:
            # U, Gold B --> False
            # U, Gold I --> False
            # U, Gold L --> False
            # U, Gold O --> False
            return False
 # Mark each move as accepted iff it is structurally valid in state `s`, and
 # return the number of accepted moves. moves[0] is always rejected and the
 # loop starts at index 1. Asserts that at least one move is accepted, which
 # also keeps the return value clear of the declared error value 0.
 cdef int set_accept_if_valid(Move* moves, int n_classes, State* s) except 0:
    cdef int n_accept = 0
    cdef Move* m
    moves[0].accept = False
    for i in range(1, n_classes):
        m = &moves[i]
        if m.action == BEGIN:
            m.accept = can_begin(s, m.label)
        elif m.action == IN:
            m.accept = can_in(s, m.label)
        elif m.action == LAST:
            m.accept = can_last(s, m.label)
        elif m.action == UNIT:
            m.accept = can_unit(s, m.label)
        elif m.action == OUT:
            m.accept = can_out(s, m.label)
        n_accept += m.accept
    assert n_accept != 0
    return n_accept
 # Restrict the accepted moves to those that are both structurally valid (see
 # set_accept_if_valid) and consistent with the gold annotation according to
 # is_oracle. Returns the number of moves left accepted and asserts that it is
 # not zero.
 cdef int set_accept_if_oracle(Move* moves, Move* golds, int n_classes, State* s) except 0:
    cdef Move* g = &golds[s.i]
    # NOTE(review): when s.i == s.length - 1 this reads golds[s.length]. That
    # is only safe if `golds` has at least one slot past the sentence --
    # confirm, or tighten the guard to s.i + 1 < s.length.
    cdef ActionType next_act = <ActionType>golds[s.i+1].action if s.i < s.length else OUT
    cdef bint is_sunk = entity_is_sunk(s, golds)
    cdef Move* m
    cdef int n_accept = 0
    set_accept_if_valid(moves, n_classes, s)
    for i in range(1, n_classes):
        m = &moves[i]
        if not m.accept:
            continue
        m.accept = is_oracle(<ActionType>m.action, m.label, <ActionType>g.action,
                             g.label, next_act, is_sunk)
        n_accept += m.accept
    assert n_accept != 0
    return n_accept
 # Return a pointer to the highest-scoring accepted move among moves[1..n-1].
 # The score of moves[i] is read from scores[i-1], i.e. the scores array is
 # offset by one relative to the moves array. Raises StandardError when no
 # move is accepted. On equal scores the earliest accepted move wins.
 cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
    cdef int first_accept = -1
    for first_accept in range(1, n):
        if moves[first_accept].accept:
            break
    else:
        raise StandardError
    assert first_accept != -1
    cdef int best = first_accept
    cdef weight_t score = scores[first_accept-1]
    cdef int i
    for i in range(first_accept+1, n): 
        if moves[i].accept and scores[i-1] > score:
            best = i
            score = scores[i-1]
    return &moves[best]
 # Apply `move` to state `s`: BEGIN opens an entity, LAST closes the current
 # one, UNIT opens and closes a one-token entity, IN and OUT leave the entity
 # list unchanged. The move's class id is then recorded for the current token
 # and the token index is advanced.
 cdef int transition(State *s, Move* move) except -1:
    if move.action == BEGIN:
        begin_entity(s, move.label)
    elif move.action == IN:
        pass
    elif move.action == LAST:
        end_entity(s)
    elif move.action == UNIT:
        begin_entity(s, move.label)
        end_entity(s)
    elif move.action == OUT:
        pass
    s.tags[s.i] = move.clas 
    s.i += 1
 def get_n_moves(n_tags):
    """Return the number of moves for `n_tags` tags: four per tag plus one."""
    moves_per_tag = 4
    return moves_per_tag * n_tags + 1
 # Fill moves[i] for each tag name in `tag_names`. A name containing '-' is
 # split into an action string and a label; 'O' maps to action 'O', and 'NULL'
 # or 'EOL' to action '?', all three with the placeholder label '-'. Any other
 # name raises StandardError. Labels get consecutive ids in order of first
 # appearance, with '-' fixed at 0.
 cdef int fill_moves(Move* moves, list tag_names) except -1:
    cdef Move* m
    label_names = {'-': 0}
    for i, tag_name in enumerate(tag_names):
        m = &moves[i]
        if '-' in tag_name:
            # NOTE(review): a tag with more than one hyphen makes this
            # two-way unpacking raise ValueError.
            action_str, label = tag_name.split('-')
        elif tag_name == 'O':
            action_str = 'O'
            label = '-'
        elif tag_name == 'NULL' or tag_name == 'EOL':
            action_str = '?'
            label = '-'
        else:
            raise StandardError(tag_name)
        m.action = ACTION_NAMES.index(action_str)
        m.label = label_names.setdefault(label, len(label_names))
        m.clas = i
--- a/spacy/ner/context.pxd
+++ b/spacy/ner/context.pxd
@ -1,155 +0,0 @@
 from thinc.typedefs cimport atom_t
 from ..typedefs cimport hash_t
 from ..tokens cimport Tokens
 from ..lexeme cimport Lexeme
 from .structs cimport State
 # Per-token field offsets: _fill_token stores each value at c[T_*], i.e.
 # relative to the base pointer it is handed for one token.
 # NOTE(review): the _fill_token visible in context.pyx never writes
 # T_is_ascii or T_ner — confirm whether that is intentional.
 cpdef enum:
    T_sic
    T_cluster
    T_norm
    T_shape
    T_asciied
    T_prefix
    T_suffix
    T_length
    T_postype
    T_nertype
    T_sensetype
    T_is_alpha
    T_is_ascii
    T_is_digit
    T_is_lower
    T_is_punct
    T_is_space
    T_is_title
    T_is_upper
    T_like_url
    T_like_number
    T_oft_lower
    T_oft_title
    T_oft_upper
    T_in_males
    T_in_females
    T_in_surnames
    T_in_places
    T_in_celebs
    T_in_names
    T_pos
    T_sense
    T_ner
 # Absolute indices into the context array.  fill_context uses the *_sic member
 # of each group as the base for one token of a five-token window:
 # P2 = i-2, P1 = i-1, W = i, N1 = i+1, N2 = i+2.  The E* members describe the
 # currently open entity, and N_FIELDS is the total number of fields.
 # NOTE(review): the groups differ in size and content (only W, N1 and N2 carry
 # *_is_space, only N1 has *_is_ascii, only N2 has *_asciied / *_sense) and none
 # of them mirrors the T_* layout that _fill_token writes through — confirm the
 # intended layout.
 cpdef enum:
    P2_sic
    P2_cluster
    P2_norm
    P2_shape
    P2_prefix
    P2_suffix
    P2_length
    P2_postype
    P2_is_alpha
    P2_is_digit
    P2_is_lower
    P2_is_punct
    P2_is_title
    P2_is_upper
    P2_like_number
    P2_pos
    P1_sic
    P1_cluster
    P1_norm
    P1_shape
    P1_prefix
    P1_suffix
    P1_length
    P1_postype
    P1_is_alpha
    P1_is_digit
    P1_is_lower
    P1_is_punct
    P1_is_title
    P1_is_upper
    P1_like_number
    P1_pos
    W_sic
    W_cluster
    W_norm
    W_shape
    W_prefix
    W_suffix
    W_length
    W_postype
    W_is_alpha
    W_is_digit
    W_is_lower
    W_is_punct
    W_is_space
    W_is_title
    W_is_upper
    W_like_number
    W_pos
    N1_sic
    N1_cluster
    N1_norm
    N1_shape
    N1_prefix
    N1_suffix
    N1_length
    N1_postype
    N1_is_alpha
    N1_is_ascii
    N1_is_digit
    N1_is_lower
    N1_is_punct
    N1_is_space
    N1_is_title
    N1_is_upper
    N1_like_number
    N1_pos
    N2_sic
    N2_cluster
    N2_norm
    N2_shape
    N2_asciied
    N2_prefix
    N2_suffix
    N2_length
    N2_postype
    N2_is_alpha
    N2_is_digit
    N2_is_lower
    N2_is_punct
    N2_is_space
    N2_is_title
    N2_is_upper
    N2_like_number
    N2_pos
    N2_sense
    E_label
    E0_sic
    E0_cluster
    E0_pos
    E1_sic
    E1_cluster
    E1_pos
    E_last_sic
    E_last_cluster
    E_last_pos
    N_FIELDS
 cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1
--- a/spacy/ner/context.pyx
+++ b/spacy/ner/context.pyx
@ -1,77 +0,0 @@
 from libc.string cimport memset
 from murmurhash.mrmr cimport hash64
 from ._state cimport entity_is_open
 from ..lexeme cimport *
 # Write the features of one token into the slice of the context array that
 # starts at `c`, using the T_* offsets.
 #   c   -- base pointer for this token's fields
 #   lex -- lexeme supplying the string, cluster, shape and flag features
 #   pos -- part-of-speech value, stored at c[T_pos]
 # NOTE(review): T_is_ascii and T_ner are never written here; they keep
 # whatever value the caller left in the buffer — confirm that is intended.
 cdef int _fill_token(atom_t* c, Lexeme* lex, atom_t pos):
    c[T_sic] = lex.sic
    c[T_cluster] = lex.cluster
    # Back off from the norm to the shape when prob is 0 or below -10.
    c[T_norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
    c[T_shape] = lex.shape
    c[T_asciied] = lex.asciied
    c[T_prefix] = lex.prefix
    c[T_suffix] = lex.suffix
    c[T_length] = lex.length
    c[T_postype] = lex.postype
    c[T_nertype] = 0
    c[T_sensetype] = 0
    # Each flag feature is the masked bit itself (0 or 1 << FLAG), not 0/1.
    c[T_is_alpha] = lex.flags & (1 << IS_ALPHA)
    c[T_is_digit] = lex.flags & (1 << IS_DIGIT)
    c[T_is_lower] = lex.flags & (1 << IS_LOWER)
    c[T_is_punct] = lex.flags & (1 << IS_PUNCT)
    c[T_is_space] = lex.flags & (1 << IS_SPACE)
    c[T_is_title] = lex.flags & (1 << IS_TITLE)
    c[T_is_upper] = lex.flags & (1 << IS_UPPER)
    c[T_like_url] = lex.flags & (1 << LIKE_URL)
    c[T_like_number] = lex.flags & (1 << LIKE_NUMBER)
    c[T_oft_lower] = lex.flags & (1 << OFT_LOWER)
    c[T_oft_title] = lex.flags & (1 << OFT_TITLE)
    c[T_oft_upper] = lex.flags & (1 << OFT_UPPER)
    c[T_in_males] = lex.flags & (1 << IN_MALES)
    c[T_in_females] = lex.flags & (1 << IN_FEMALES)
    c[T_in_surnames] = lex.flags & (1 << IN_SURNAMES)
    c[T_in_places] = lex.flags & (1 << IN_PLACES)
    c[T_in_celebs] = lex.flags & (1 << IN_CELEBS)
    c[T_in_names] = lex.flags & (1 << IN_NAMES)
    c[T_pos] = pos
    c[T_sense] = 0
 cdef int _fill_outer_token(atom_t* c, Lexeme* lex, atom_t pos):
    """Write a reduced four-field view of a token into ``c[0:4]``.

    The fields are, in order: sic, cluster, shape, and the given ``pos``.
    """
    c[0], c[1], c[2], c[3] = lex.sic, lex.cluster, lex.shape, pos
 cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1:
    """Zero ``context`` and fill it with features for the state's current token.

    ``context`` must hold at least N_FIELDS atoms.  A five-token window
    centred on ``s.i`` is written first; the E* fields are then set only while
    an entity is open.  Always returns 1.
    """
    cdef int i
    for i in range(N_FIELDS):
        context[i] = 0
    i = s.i
    # NOTE(review): i-2 .. i+2 are not range-checked here; presumably
    # tokens.lex / tokens.pos are padded on both sides — confirm.
    # NOTE(review): _fill_token writes T_* offsets up to T_sense from each base
    # index, which appears to run past that group's own fields (and, for
    # N2_sic, past N_FIELDS) — verify against the enum layout.
    _fill_token(&context[P2_sic], tokens.lex[i-2], tokens.pos[i-2])
    _fill_token(&context[P1_sic], tokens.lex[i-1], tokens.pos[i-1])
    _fill_token(&context[W_sic], tokens.lex[i], tokens.pos[i])
    _fill_token(&context[N1_sic], tokens.lex[i+1], tokens.pos[i+1])
    _fill_token(&context[N2_sic], tokens.lex[i+2], tokens.pos[i+2])
    if entity_is_open(s):
        context[E_label] = s.curr.label
        # First token of the open entity.
        context[E0_sic] = tokens.lex[s.curr.start].sic
        context[E0_cluster] = tokens.lex[s.curr.start].cluster
        context[E0_pos] = tokens.pos[s.curr.start]
        # Most recent token before the current one.
        context[E_last_sic] = tokens.lex[s.i-1].sic
        context[E_last_cluster] = tokens.lex[s.i-1].cluster
        context[E_last_pos] = tokens.pos[s.i-1]
        # Second token of the entity, once the entity spans at least two tokens.
        if (s.curr.start + 1) < s.i:
            context[E1_sic] = tokens.lex[s.curr.start+1].sic
            context[E1_cluster] = tokens.lex[s.curr.start+1].cluster
            context[E1_pos] = tokens.pos[s.curr.start+1]
    return 1
--- a/spacy/ner/feats.pxd
+++ b/spacy/ner/feats.pxd
--- a/spacy/ner/feats.pyx
+++ b/spacy/ner/feats.pyx
@ -1,107 +0,0 @@
 from .context import *
 # Feature templates.  Each template is a tuple of context-field ids taken from
 # the .context module; the per-group tuples are concatenated into TEMPLATES.

 # Word-form, affix, shape and norm features of the five-token window.
 LOCAL = (
    (W_sic,),
    (P1_sic,),
    (N1_sic,),
    (P2_sic,),
    (N2_sic,),
    (P1_sic, W_sic,),
    (W_sic, N1_sic),
    (W_prefix,),
    (W_suffix,),
    (P1_shape,),
    (W_shape,),
    (N1_shape,),
    (P1_shape, W_shape,),
    # NOTE(review): same two fields as the template above, in reverse order;
    # possibly meant to be (W_shape, N1_shape) — confirm.
    (W_shape, P1_shape,),
    (P1_shape, W_shape, N1_shape),
    (N2_shape,),
    (P2_shape,),
    (P2_norm, P1_norm, W_norm),
    (P1_norm, W_norm, N1_norm),
    (W_norm, N1_norm, N2_norm)
 )
 # Part-of-speech unigrams, bigrams and trigrams over the window.
 POS = (
    (P2_pos,),
    (P1_pos,),
    (W_pos,),
    (N1_pos,),
    (N2_pos,),
    (P1_pos, W_pos),
    (W_pos, N1_pos),
    (P2_pos, P1_pos, W_pos),
    (P1_pos, W_pos, N1_pos),
    (W_pos, N1_pos, N2_pos)
 )
 # Cluster unigrams and bigrams over the window.
 CLUSTERS = (
    (P2_cluster,),
    (P1_cluster,),
    (W_cluster,),
    (N1_cluster,),
    (N2_cluster,),
    (P1_cluster, W_cluster),
    (W_cluster, N1_cluster),
 )
 # Cluster / part-of-speech conjunctions of adjacent tokens.
 CLUSTER_POS = (
    (P1_cluster, W_pos),
    # NOTE(review): same two fields as the template above, in reverse order;
    # possibly meant to pair P1_pos with W_cluster — confirm.
    (W_pos, P1_cluster),
    (W_cluster, N1_pos),
    (W_pos, N1_cluster)
 )
 # Features of the currently open entity (E* fields), alone and conjoined with
 # the current word.
 STATE = (
   (E0_sic,),
   (E0_cluster,),
   (E0_pos,),
   (E_last_sic,),
   (E_last_cluster,),
   (E_last_pos,),
   (E0_sic, W_sic),
   (E0_cluster, W_cluster),
   (E0_pos, W_pos),
   (E_last_sic, W_sic),
   (E_last_pos, W_pos),
   (E0_pos, E_last_pos, W_pos),
   (E0_cluster, E_last_cluster, W_cluster),
   (E0_sic, E_last_sic),
   (E0_pos, E_last_pos),
   (E0_cluster, E_last_cluster),
   (E0_pos, E_last_cluster),
   (E0_cluster, E_last_pos),
   (E1_sic,),
   (E1_cluster,),
   (E1_pos,),
   (E0_sic, E1_sic),
   (E0_sic, E1_pos,),
   (E0_pos, E1_sic,),
   (E0_pos, E1_pos),
   (E_label,),
   (E_label, W_sic),
   (E_label, W_pos),
   (E_label, W_cluster),
   (E_label, W_shape),
   (E_label, E_last_sic),
   (E_label, E0_pos, E_last_pos),
 )
 # Full template set, in the order the groups are concatenated.
 TEMPLATES = LOCAL + CLUSTERS + POS + CLUSTER_POS + STATE
--- a/spacy/ner/greedy_parser.pxd
+++ b/spacy/ner/greedy_parser.pxd
@ -1,29 +0,0 @@
 from cymem.cymem cimport Pool
 from thinc.features cimport Extractor
 from thinc.learner cimport LinearModel
 from thinc.typedefs cimport *
 from ..tokens cimport Tokens
 from ..typedefs cimport *
 from .structs cimport Move
 from .annot cimport NERAnnotation
 # C-level declarations for the NERParser extension type.
 cdef class NERParser:
    # Memory pool from which the raw buffers below are allocated.
    cdef Pool mem
    # Feature extractor and linear model used to score moves.
    cdef Extractor extractor
    cdef LinearModel model
    # Read-only from Python.
    cdef readonly list tag_names
    cdef readonly list entity_types
    cdef readonly int n_classes
    # Raw buffers: move table, context atoms, extracted features and their
    # values, and per-class scores.
    cdef Move* _moves
    cdef atom_t* _context
    cdef feat_t* _feats
    cdef weight_t* _values
    cdef weight_t* _scores
    # NOTE(review): greedy_parser.pyx defines these as
    # `cpdef int train(self, Tokens tokens, gold_classes)` and
    # `cpdef int set_tags(self, Tokens tokens) except -1`; the return types and
    # the second argument of train disagree with the declarations here.
    cpdef list train(self, Tokens tokens, NERAnnotation annot)
    cpdef list set_tags(self, Tokens tokens)
--- a/spacy/ner/greedy_parser.pyx
+++ b/spacy/ner/greedy_parser.pyx
@ -1,81 +0,0 @@
 cimport cython
 import random
 import os
 from os import path
 import shutil
 import json
 from thinc.features cimport ConjFeat
 from ..context cimport fill_context
 from ..context cimport N_FIELDS
 from .moves cimport Move
 from .moves cimport fill_moves, transition, best_accepted
 from .moves cimport set_accept_if_valid, set_accept_if_oracle
 from .moves import get_n_moves
 from ._state cimport State
 from ._state cimport init_state
 cdef class NERParser:
    """Greedy named-entity tagger: at every token it scores the moves with a
    linear model and applies the best accepted one."""
    def __init__(self, model_dir):
        """Read ``config.json`` from ``model_dir``, build the move table and
        scratch buffers, and load ``model`` from the same directory if present.
        """
        self.mem = Pool()
        # Close the config file deterministically instead of leaking the handle.
        with open(path.join(model_dir, 'config.json')) as file_:
            cfg = json.load(file_)
        templates = cfg['templates']
        self.entity_types = cfg['entity_types']
        self.extractor = Extractor(templates, [ConjFeat] * len(templates))
        self.n_classes = get_n_moves(len(self.entity_types))
        self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
        # NOTE(review): passes a count where the fill_moves variants visible in
        # this package take a list — confirm against .moves.
        fill_moves(self._moves, len(self.entity_types))
        # NOTE(review): tag_names is not assigned anywhere in this class before
        # it is measured here — confirm where it is meant to be set.
        self.model = LinearModel(len(self.tag_names))
        if path.exists(path.join(model_dir, 'model')):
            self.model.load(path.join(model_dir, 'model'))
        self._context = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))
        self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
        self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
        self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
    cpdef int train(self, Tokens tokens, gold_classes):
        """Run the tagger over ``tokens``, updating the model whenever the
        predicted move differs from the best gold-consistent move.

        ``gold_classes`` holds one gold class id per token.
        """
        cdef Pool mem = Pool()
        cdef State* s = init_state(mem, tokens.length)
        cdef Move* golds = <Move*>mem.alloc(len(gold_classes), sizeof(Move))
        for i, clas in enumerate(gold_classes):
            # NOTE(review): the declarations visible for this class expose
            # `_moves` (not `moves`), and the Move struct in structs.pxd has
            # `clas` (not `id`) — confirm these names.
            golds[i] = self.moves[clas - 1]
            assert golds[i].id == clas
        cdef Move* guess
        while s.i < tokens.length:
            fill_context(self._context, s.i, tokens)
            self.extractor.extract(self._feats, self._values, self._context, NULL)
            self.model.score(self._scores, self._feats, self._values)
            set_accept_if_valid(self._moves, self.n_classes, s)
            guess = best_accepted(self._moves, self._scores, self.n_classes)
            set_accept_if_oracle(self._moves, golds, self.n_classes, s) # TODO
            gold = best_accepted(self._moves, self._scores, self.n_classes)
            if guess.clas == gold.clas:
                self.model.update({})
                # NOTE(review): returns from the whole call on the first correct
                # guess, so the remaining tokens are never visited — confirm
                # this is intended rather than a `continue` after a transition.
                return 0
            counts = {guess.clas: {}, gold.clas: {}}
            self.extractor.count(counts[gold.clas], self._feats, 1)
            self.extractor.count(counts[guess.clas], self._feats, -1)
            self.model.update(counts)
            transition(s, guess)
            tokens.ner[s.i-1] = s.tags[s.i-1]
    cpdef int set_tags(self, Tokens tokens) except -1:
        """Tag ``tokens`` greedily, writing each chosen class into ``tokens.ner``."""
        cdef Pool mem = Pool()
        cdef State* s = init_state(mem, tokens.length)
        cdef Move* move
        while s.i < tokens.length:
            fill_context(self._context, s.i, tokens)
            self.extractor.extract(self._feats, self._values, self._context, NULL)
            self.model.score(self._scores, self._feats, self._values)
            set_accept_if_valid(self._moves, self.n_classes, s)
            move = best_accepted(self._moves, self._scores, self.n_classes)
            transition(s, move)
            tokens.ner[s.i-1] = s.tags[s.i-1]
--- a/spacy/ner/io_moves.pxd
+++ b/spacy/ner/io_moves.pxd
@ -1,26 +0,0 @@
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport class_t
 from thinc.typedefs cimport weight_t
 from .structs cimport State, Move
 # Action types of the shift/reduce entity transition system in io_moves.pyx.
 cpdef enum ActionType:
    # Placeholder action; fill_moves reserves class 0 for it.
    MISSING
    # Advance one token, opening an entity first if none is open.
    SHIFT
    # Close the open entity and store it; does not advance.
    REDUCE
    # Advance one token without touching the entity state.
    OUT
    # Number of action types; ACTION_NAMES is sized with it.
    N_ACTIONS
 cdef int set_accept_if_oracle(Move* moves, int n, State* s,
                              int* g_starts, int* g_ends, int* g_labels) except 0
 cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0
 cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL
 cdef int transition(State *s, Move* m) except -1
 cdef int fill_moves(Move* moves, int n, list entity_types) except -1
--- a/spacy/ner/io_moves.pyx
+++ b/spacy/ner/io_moves.pyx
@ -1,161 +0,0 @@
 from __future__ import unicode_literals
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport class_t
 from thinc.typedefs cimport weight_t
 from ._state cimport begin_entity
 from ._state cimport end_entity
 from ._state cimport entity_is_open
 # One display symbol per action type, indexed by the action's integer value.
 ACTION_NAMES = [''] * N_ACTIONS
 ACTION_NAMES[<int>MISSING] = '?'
 ACTION_NAMES[<int>SHIFT] = 'S'
 ACTION_NAMES[<int>REDUCE] = 'R'
 ACTION_NAMES[<int>OUT] = 'O'
 # Set moves[k].accept for k in 1..n-1 according to the gold annotation and
 # return how many moves were accepted (asserted to be non-zero).
 #   g_starts, g_ends, g_labels -- gold arrays indexed by token position; they
 #   are read at s.curr.start and at s.i.
 # NOTE(review): with an open entity and s.i == s.length, the third branch
 # reads g_starts[s.i] — confirm the arrays are long enough for that index.
 cdef int set_accept_if_oracle(Move* moves, int n, State* s,
                              int* g_starts, int* g_ends, int* g_labels) except 0:
    # Decision table implemented below (S = SHIFT, R = REDUCE, O = OUT).
    # Entity open (O is never accepted):
    #   gold entity at curr.start begins there and ends at i     -> R only
    #   gold entity at curr.start begins there and ends after i  -> S only
    #   otherwise, a gold entity starts at i                     -> R only
    #   otherwise                                                -> R and S
    # No entity open (R is never accepted):
    #   a gold entity starts at i  -> S only, with the gold label
    #   otherwise                  -> O only
    if entity_is_open(s):
        g_start = g_starts[s.curr.start]
        g_end = g_ends[s.curr.start]
        accept_o = False
        if g_start == s.curr.start and g_end == s.i:
            accept_r = True
            accept_s = False
        elif g_start == s.curr.start and g_end > s.i:
            accept_s = True
            s_label = s.curr.label
            accept_r = False
        elif g_starts[s.i] == s.i:
            accept_r = True
            accept_s = False
        else:
            accept_r = True
            accept_s = True
            s_label = s.curr.label
    else:
        accept_r = False
        if g_starts[s.i] == s.i:
            accept_s = True
            s_label = g_labels[s.i]
            accept_o = False
        else:
            accept_o = True
            accept_s = False
    n_accept = 0
    # Slot 0 is never accepted.
    moves[0].accept = False
    for i in range(1, n):
        m = &moves[i]
        if m.action == SHIFT:
            # s_label is only assigned on paths that set accept_s, so the
            # short-circuit keeps it from being read while unset.
            m.accept = accept_s and m.label == s_label
        elif m.action == REDUCE:
            m.accept = accept_r
        elif m.action == OUT:
            m.accept = accept_o
        n_accept += m.accept
    assert n_accept != 0
    return n_accept
 cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0:
    """Flag which of ``moves[1:n]`` are legal in state ``s``; return the count.

    SHIFT needs a token left to consume and, while an entity is open, the
    entity's own label.  REDUCE needs an open entity.  OUT needs a token left
    and no open entity.  Slot 0 is always rejected.
    NOTE(review): declared ``except 0``, so a legitimate count of zero looks
    like the error return to callers — confirm that state cannot occur.
    """
    cdef bint open_ent = entity_is_open(s)
    cdef bint has_token = s.i < s.length
    cdef int n_accept = 0
    cdef int k
    cdef Move* m
    moves[0].accept = False
    for k in range(1, n):
        m = &moves[k]
        if m.action == SHIFT:
            m.accept = has_token and (not open_ent or m.label == s.curr.label)
        elif m.action == REDUCE:
            m.accept = open_ent
        elif m.action == OUT:
            m.accept = has_token and not open_ent
        # Moves with any other action keep their previous flag but still count.
        n_accept += m.accept
    return n_accept
 cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
    """Pick the accepted move with the highest score.

    Only ``moves[1:n]`` are examined; ``moves[k]`` is scored by
    ``scores[k-1]``.  The earliest move wins a tie.  Raises StandardError if
    none of the examined moves has its ``accept`` flag set.
    """
    cdef int winner = -1
    cdef weight_t top = 0
    cdef int k
    for k in range(1, n):
        if moves[k].accept and (winner == -1 or scores[k-1] > top):
            winner = k
            top = scores[k-1]
    if winner == -1:
        raise StandardError
    return &moves[winner]
 cdef int transition(State *s, Move* move) except -1:
    """Apply ``move`` to ``s``.

    The move's class is always recorded in ``s.tags[s.i]`` first.  REDUCE
    stores the open entity (ending at ``s.i``) and resets the scratch entity
    without advancing.  SHIFT opens an entity at ``s.i`` if none is open, then
    advances; OUT just advances.  Any other action raises ValueError.
    """
    s.tags[s.i] = move.clas
    if move.action == REDUCE:
        s.curr.end = s.i
        s.ents[s.j] = s.curr
        s.j += 1
        # Reset the scratch entity to its "closed" values.
        s.curr.start = 0
        s.curr.label = -1
        s.curr.end = 0
        return 0
    if move.action != OUT and move.action != SHIFT:
        raise ValueError(move.action)
    if move.action == SHIFT and not entity_is_open(s):
        s.curr.start = s.i
        s.curr.label = move.label
    s.i += 1
    return 0
 def get_n_moves(n_tags):
    """Return the move-table size: one move per tag plus three fixed moves."""
    return n_tags + 3
 cdef int fill_moves(Move* moves, int n, list entity_types) except -1:
    """Build the move table: MISSING, one SHIFT per entity type, OUT, REDUCE.

    Class ids equal table positions.  SHIFT labels come from a running label
    table that starts with '-' as 0; every other move gets label 0.
    NOTE(review): ``n`` is not consulted, so nothing here checks that
    ``moves`` can hold ``len(entity_types) + 3`` entries.
    """
    label_names = {'-': 0}
    # Class 0 is reserved for the MISSING placeholder.
    moves[0].clas = 0
    moves[0].action = MISSING
    moves[0].label = 0
    cdef int slot = 1
    for entity_type in entity_types:
        moves[slot].action = SHIFT
        moves[slot].label = label_names.setdefault(entity_type, len(label_names))
        moves[slot].clas = slot
        slot += 1
    # OUT and REDUCE take the two slots after the SHIFT moves, in that order.
    moves[slot].clas = slot
    moves[slot].action = OUT
    moves[slot].label = 0
    moves[slot + 1].action = REDUCE
    moves[slot + 1].clas = slot + 1
    moves[slot + 1].label = 0
 cdef bint is_final(State* s):
    """Return True once every token is consumed and no entity is left open."""
    if s.i != s.length:
        return False
    return not entity_is_open(s)
--- a/spacy/ner/pystate.pxd
+++ b/spacy/ner/pystate.pxd
@ -1,16 +0,0 @@
 from cymem.cymem cimport Pool
 from .structs cimport Move, State
 # C-level declarations for PyState, a Python-visible wrapper around one
 # State and its move table.
 cdef class PyState:
    # Memory pool from which the buffers below are allocated.
    cdef Pool mem
    # Read-only from Python.
    cdef readonly list tag_names
    cdef readonly int n_classes
    cdef readonly dict moves_by_name
    # Move table, gold moves and the wrapped state.
    cdef Move* _moves
    cdef Move* _golds
    cdef State* _s
    # Look up the move for a tag name; NULL signals an error.
    cdef Move* _get_move(self, unicode move_name) except NULL
--- a/spacy/ner/pystate.pyx
+++ b/spacy/ner/pystate.pyx
@ -1,60 +0,0 @@
 from __future__ import unicode_literals
 from ._state cimport init_state
 from ._state cimport entity_is_open
 from .bilou_moves cimport fill_moves
 from .bilou_moves cimport transition
 from .bilou_moves cimport set_accept_if_valid, set_accept_if_oracle
 from .bilou_moves import get_n_moves
 from .bilou_moves import ACTION_NAMES
 # Python-facing wrapper that drives the bilou_moves transition functions on a
 # single State, addressing moves by tag name.
 cdef class PyState:
    def __init__(self, tag_names, n_tokens):
        # One move per tag name; the state and the gold buffer are sized by
        # n_tokens.
        self.mem = Pool()
        self.tag_names = tag_names
        self.n_classes = len(tag_names)
        assert self.n_classes != 0
        self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
        fill_moves(self._moves, tag_names)
        self._s = init_state(self.mem, n_tokens)
        self._golds = <Move*>self.mem.alloc(n_tokens, sizeof(Move))
    cdef Move* _get_move(self, unicode move_name) except NULL:
        # list.index raises ValueError for a name that is not in tag_names.
        return &self._moves[self.tag_names.index(move_name)]
    def set_golds(self, list gold_names):
        # Copy the move for each gold tag name into the gold buffer.
        # NOTE(review): nothing checks len(gold_names) against the n_tokens
        # entries allocated for _golds — confirm callers never pass more.
        cdef Move* m
        for i, name in enumerate(gold_names):
            m = self._get_move(name)
            self._golds[i] = m[0]
    def transition(self, unicode move_name):
        # Apply the named move to the wrapped state.
        cdef Move* m = self._get_move(move_name)
        transition(self._s, m)
    def is_valid(self, unicode move_name):
        # Recompute the accept flags for the current state, then report the
        # flag of the named move.
        cdef Move* m = self._get_move(move_name)
        set_accept_if_valid(self._moves, self.n_classes, self._s)
        return m.accept
    def is_gold(self, unicode move_name):
        # Same as is_valid, but with the flags set by the gold oracle.
        cdef Move* m = self._get_move(move_name)
        set_accept_if_oracle(self._moves, self._golds, self.n_classes, self._s)
        return m.accept
    # Read-only views of the wrapped state.
    property ent:
        def __get__(self):
            return self._s.curr
    property n_ents:
        def __get__(self):
            return self._s.j
    property i:
        def __get__(self):
            return self._s.i
    property open_entity:
        def __get__(self):
            return entity_is_open(self._s)
--- a/spacy/ner/structs.pxd
+++ b/spacy/ner/structs.pxd
@ -1,23 +0,0 @@
 from thinc.typedefs cimport class_t
 # An entity span with an integer label.  The io_moves transition code sets
 # `start` when the entity is opened and `end` when it is reduced, both from
 # the state's token index.
 cdef struct Entity:
    int start
    int end
    int label
 # Tagger state.
 #   curr   -- entity currently being built
 #   ents   -- stored entities; `j` is the next free slot
 #   tags   -- class recorded for each token position
 #   i      -- index of the current token
 #   length -- compared against `i` to detect the end of the input
 cdef struct State:
    Entity curr
    Entity* ents
    int* tags
    int i
    int j
    int length
 # One entry of a move table.
 #   clas   -- class id of the move
 #   action -- action type, stored as an int
 #   label  -- integer label id
 #   accept -- flag rewritten by the set_accept_if_* functions
 cdef struct Move:
    class_t clas
    int action
    int label
    bint accept
--- a/spacy/pos_feats.pxd
+++ b/spacy/pos_feats.pxd
--- a/spacy/pos_feats.pyx
+++ b/spacy/pos_feats.pyx
@ -1,41 +0,0 @@
 from spacy.context cimport FIELD_IDS, Token
 # Short aliases for the five token slots exposed by FIELD_IDS.
 cpdef Token P2 = FIELD_IDS.P2
 cpdef Token P1 = FIELD_IDS.P1
 cpdef Token N0 = FIELD_IDS.N0
 cpdef Token N1 = FIELD_IDS.N1
 cpdef Token N2 = FIELD_IDS.N2
 # Feature templates: each entry is a tuple of field ids drawn from the slots
 # above.
 TEMPLATES = (
    (N0.sic,),
    (N0.norm,),
    (N0.suffix,),
    (N0.prefix,),
    (P1.pos,),
    (P2.pos,),
    (P1.pos, P2.pos),
    (P1.pos, N0.norm),
    (P1.norm,),
    (P1.suffix,),
    (P2.norm,),
    (N1.norm,),
    (N1.suffix,),
    (N2.norm,),
    (N0.shape,),
    (N0.cluster,),
    (N1.cluster,),
    (N2.cluster,),
    (P1.cluster,),
    (P2.cluster,),
    (N0.oft_upper,),
    (N0.oft_title,),
    (N0.postype,),
    (P1.like_url,),
    (N1.like_number,),
    (N1.like_url,),
 )
--- a/spacy/pos_util.py
+++ b/spacy/pos_util.py
@ -1,153 +0,0 @@
 from __future__ import unicode_literals
 from . import util
 from . import tokens
 from .en import EN
 def read_gold(file_, tag_list, col):
    """Read gold tags from ``file_`` and align them to freshly tokenized text.

    The file holds blank-line-separated paragraphs.  In each paragraph the
    first line is the raw text, the second line is skipped, and every further
    line is a whitespace-separated record whose first column is the token's
    character offset and whose column ``col`` is its tag.

    A token gets the tag of the record that starts at the same offset, and
    'NULL' when no record does.  Records that start before the current token
    are dropped so that later tokens can still line up.

    Returns a list of ``(tokens, tag_ids)`` pairs.  ``tag_list`` is extended
    in place with any tag not seen before.
    """
    paras = file_.read().strip().split('\n\n')
    golds = []
    tag_ids = dict((tag, i) for i, tag in enumerate(tag_list))
    for para in paras:
        if not para.strip():
            continue
        lines = para.strip().split('\n')
        raw = lines.pop(0)
        # The second line is not used for the alignment.
        lines.pop(0)
        toks = EN.tokenize(raw)
        conll_toks = []
        for line in lines:
            pieces = line.split()
            conll_toks.append((int(pieces[0]), len(pieces[1]), pieces[col]))
        tags = []
        next_gold = 0
        for token in toks:
            # Skip gold records that begin before this token; they can no
            # longer match anything.
            while (next_gold < len(conll_toks)
                   and conll_toks[next_gold][0] < token.idx):
                next_gold += 1
            if (next_gold < len(conll_toks)
                    and conll_toks[next_gold][0] == token.idx):
                tags.append(conll_toks[next_gold][2])
                next_gold += 1
            else:
                tags.append('NULL')
        assert len(tags) == len(toks)
        tags = [_encode_pos(t, tag_ids, tag_list) for t in tags]
        golds.append((toks, tags))
    return golds
 def _encode_pos(tag, tag_ids, tag_list):
    if tag == '-':
        return 0
    if tag not in tag_ids:
        tag_ids[tag] = len(tag_list)
        tag_list.append(tag)
    return tag_ids[tag]
 # Fine-grained tags mapped to coarse tags, one whitespace-separated
 # "TAG COARSE" pair per row.  Where a tag is listed twice the later row wins.
 _PTB_TO_UNIV_TABLE = """
 NULL    NULL
 HYPH   .
 ADD X
 NFP .
 AFX X
 XX  X
 BES VERB
 HVS VERB
 GW  X
 !	.
 #	.
 $	.
 ''	.
 (	.
 )	.
 ,	.
 -LRB-	.
 -RRB-	.
 .	.
 :	.
 ?	.
 CC	CONJ
 CD	NUM
 CD|RB	X
 DT	DET
 EX	DET
 FW	X
 IN	ADP
 IN|RP	ADP
 JJ	ADJ
 JJR	ADJ
 JJRJR	ADJ
 JJS	ADJ
 JJ|RB	ADJ
 JJ|VBG	ADJ
 LS	X
 MD	VERB
 NN	NOUN
 NNP	NOUN
 NNPS	NOUN
 NNS	NOUN
 NN|NNS	NOUN
 NN|SYM	NOUN
 NN|VBG	NOUN
 NP	NOUN
 PDT	DET
 POS	PRT
 PRP	PRON
 PRP$	PRON
 PRP|VBP	PRON
 PRT	PRT
 RB	ADV
 RBR	ADV
 RBS	ADV
 RB|RP	ADV
 RB|VBG	ADV
 RN	X
 RP	PRT
 SYM	X
 TO	PRT
 UH	X
 VB	VERB
 VBD	VERB
 VBD|VBN	VERB
 VBG	VERB
 VBG|NN	VERB
 VBN	VERB
 VBP	VERB
 VBP|TO	VERB
 VBZ	VERB
 VP	VERB
 WDT	DET
 WH	X
 WP	PRON
 WP$	PRON
 WRB	ADV
 !	PRT
 #	X
 $	NUM
 &	CONJ
 ,	.
 @	X
 A	ADJ
 D	DET
 E	X
 G	X
 L	PRT
 M	PRT
 N	NOUN
 O	PRON
 P	ADP
 R	ADV
 S	NOUN
 T	PRT
 U	X
 V	VERB
 X	PRT
 Y	PRT
 Z	NOUN
 ^	NOUN
 ~	X
 ``	.
 EOL EOL"""
 # Parsed once at import time instead of on every call.
 _PTB_TO_UNIV = dict(tuple(line.split())
                     for line in _PTB_TO_UNIV_TABLE.strip().split('\n'))
 def ptb_to_univ(tag):
    """Return the coarse tag for ``tag``.

    Raises KeyError when ``tag`` is not in the table.
    """
    return _PTB_TO_UNIV[tag]
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@ -8,7 +8,7 @@ from .structs cimport Utf8Str, UniStr
 cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
    s.chars = &chars[start]
    s.n = end - start
-    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
+    s.key = hash64(s.chars, <int>(s.n * sizeof(Py_UNICODE)), 0)
 cdef class StringStore:
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@ -1,6 +1,6 @@
 from libc.stdint cimport uint8_t, uint32_t
-from .typedefs cimport flags_t, attr_t, id_t, hash_t
+from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t
 cdef struct Lexeme:
@ -34,7 +34,7 @@ cdef struct Morphology:
 cdef struct PosTag:
    Morphology morph
    int id
-    int pos
+    univ_tag_t pos
 cdef struct TokenC:
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@ -2,7 +2,7 @@ from libc.stdint cimport uint32_t
 from cymem.cymem cimport Pool
-from ..tokens cimport TokenC
+from ..structs cimport TokenC
 cdef struct State:
@ -20,7 +20,8 @@ cdef int pop_stack(State *s) except -1
 cdef int push_stack(State *s) except -1
-cdef bint has_head(const TokenC* t) nogil
+cdef inline bint has_head(const TokenC* t) nogil:
    return t.head != 0
 cdef inline int get_idx(const State* s, const TokenC* t) nogil:
@ -70,29 +71,14 @@ cdef inline bint is_final(const State *s) nogil:
    return at_eol(s) # The stack will be attached to root anyway
-cdef int children_in_buffer(const State *s, const int head, const int* gold) except -1
+cdef int children_in_buffer(const State *s, const int head, int* gold) except -1
-cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1
+cdef int head_in_buffer(const State *s, const int child, int* gold) except -1
-cdef int children_in_stack(const State *s, const int head, const int* gold) except -1
+cdef int children_in_stack(const State *s, const int head, int* gold) except -1
-cdef int head_in_stack(const State *s, const int child, const int* gold) except -1
+cdef int head_in_stack(const State *s, const int child, int* gold) except -1
 cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL
 cdef int count_left_kids(const TokenC* head) nogil
 cdef int count_right_kids(const TokenC* head) nogil
 # From https://en.wikipedia.org/wiki/Hamming_weight
 cdef inline uint32_t _popcount(uint32_t x) nogil:
    """Find number of non-zero bits."""
    cdef int count = 0
    while x != 0:
        x &= x - 1
        count += 1
    return count
 cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
    cdef int i
--- a/spacy/syntax/_state.pyx
+++ b/spacy/syntax/_state.pyx
@ -3,32 +3,24 @@ from libc.string cimport memmove
 from cymem.cymem cimport Pool
 from ..lexeme cimport EMPTY_LEXEME
 from ..tokens cimport TokenC
 DEF PADDING = 5
 DEF NON_MONOTONIC = True
 cdef int add_dep(State *s, int head, int child, int label) except -1:
-    cdef int dist = head - child
+    s.sent[child].head = head - child
    s.sent[child].head = dist
    s.sent[child].dep_tag = label
    # Keep a bit-vector tracking child dependencies.  If a word has a child at
    # offset i from it, set that bit (tracking left and right separately)
    if child > head:
-        s.sent[head].r_kids |= 1 << (-dist)
+        s.sent[head].r_kids |= 1 << (-s.sent[child].head)
    else:
-        s.sent[head].l_kids |= 1 << dist
+        s.sent[head].l_kids |= 1 << s.sent[child].head
 cdef int pop_stack(State *s) except -1:
    assert s.stack_len >= 1
    s.stack_len -= 1
    s.stack -= 1
-    if s.stack_len == 0 and not at_eol(s):
+
        push_stack(s)
 cdef int push_stack(State *s) except -1:
    assert s.i < s.sent_len
@ -36,14 +28,9 @@ cdef int push_stack(State *s) except -1:
    s.stack[0] = s.i
    s.stack_len += 1
    s.i += 1
    if at_eol(s):
        while s.stack_len != 0:
            if not has_head(get_s0(s)):
                get_s0(s).dep_tag = 0
            pop_stack(s)
-cdef int children_in_buffer(const State *s, int head, const int* gold) except -1:
+cdef int children_in_buffer(const State *s, int head, int* gold) except -1:
    # Golds holds an array of head offsets --- the head of word i is i - golds[i]
    # Iterate over the tokens of the queue, and check whether their gold head is
    # our target
@ -55,21 +42,20 @@ cdef int children_in_buffer(const State *s, int head, const int* gold) except -1
    return n
-cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1:
+cdef int head_in_buffer(const State *s, const int child, int* gold) except -1:
    return gold[child] >= s.i
-cdef int children_in_stack(const State *s, const int head, const int* gold) except -1:
+cdef int children_in_stack(const State *s, const int head, int* gold) except -1:
    cdef int i
    cdef int n = 0
    for i in range(s.stack_len):
        if gold[s.stack[-i]] == head:
-            if NON_MONOTONIC or not has_head(get_s0(s)):
+            n += 1
                n += 1
    return n
-cdef int head_in_stack(const State *s, const int child, const int* gold) except -1:
+cdef int head_in_stack(const State *s, const int child, int* gold) except -1:
    cdef int i
    for i in range(s.stack_len):
        if gold[child] == s.stack[-i]:
@ -86,7 +72,7 @@ cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) n
    if child >= s.sent:
        return child
    else:
-        return NULL
+        return s.sent - 1
 cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil:
@ -98,20 +84,10 @@ cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx)
    if child < (s.sent + s.sent_len):
        return child
    else:
-        return NULL
+        return s.sent - 1
-cdef bint has_head(const TokenC* t) nogil:
+DEF PADDING = 5
    return t.head != 0
 cdef int count_left_kids(const TokenC* head) nogil:
    return _popcount(head.l_kids)
 cdef int count_right_kids(const TokenC* head) nogil:
    return _popcount(head.r_kids)
 cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL:
@ -126,5 +102,4 @@ cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NUL
    s.stack_len = 0
    s.i = 0
    s.sent_len = sent_length
    push_stack(s)
    return s
--- a/spacy/syntax/arc_eager.pxd
+++ b/spacy/syntax/arc_eager.pxd
@ -7,11 +7,8 @@ from ._state cimport State
 cdef struct Transition:
    int clas
    int move
    int label
    int cost
    weight_t score
 cdef class TransitionSystem:
@ -21,8 +18,7 @@ cdef class TransitionSystem:
    cdef const Transition* _moves
-    cdef Transition best_valid(self, const weight_t* scores, const State* s) except *
+    cdef Transition best_valid(self, const weight_t* scores, const State* s) except -1
-    cdef Transition best_gold(self, Transition* guess, const weight_t* scores,
+    cdef Transition best_gold(self, const weight_t* scores, const State* s,
-                              const State* s,
+                              int* gold_heads, int* gold_labels) except -1
                              const int* gold_heads, const int* gold_labels) except *
    cdef int transition(self, State *s, const Transition* t) except -1
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@ -7,8 +7,6 @@ from ._state cimport head_in_stack, children_in_stack
 from ..tokens cimport TokenC
 DEF NON_MONOTONIC = True
 cdef enum:
    SHIFT
@ -27,30 +25,22 @@ cdef inline bint _can_right(const State* s) nogil:
 cdef inline bint _can_left(const State* s) nogil:
-    if NON_MONOTONIC:
+    return s.stack_len >= 1 and not has_head(get_s0(s))
        return s.stack_len >= 1
    else:
        return s.stack_len >= 1 and not has_head(get_s0(s))
 cdef inline bint _can_reduce(const State* s) nogil:
-    if NON_MONOTONIC:
+    return s.stack_len >= 2 and has_head(get_s0(s))
        return s.stack_len >= 2
    else:
        return s.stack_len >= 2 and has_head(get_s0(s))
-cdef int _shift_cost(const State* s, const int* gold) except -1:
+cdef int _shift_cost(const State* s, int* gold) except -1:
    assert not at_eol(s)
    cost = 0
    cost += head_in_stack(s, s.i, gold)
    cost += children_in_stack(s, s.i, gold)
    if NON_MONOTONIC:
        cost += gold[s.stack[0]] == s.i
    return cost
-cdef int _right_cost(const State* s, const int* gold) except -1:
+cdef int _right_cost(const State* s, int* gold) except -1:
    assert s.stack_len >= 1
    cost = 0
    if gold[s.i] == s.stack[0]:
@ -58,12 +48,10 @@ cdef int _right_cost(const State* s, const int* gold) except -1:
    cost += head_in_buffer(s, s.i, gold)
    cost += children_in_stack(s, s.i, gold)
    cost += head_in_stack(s, s.i, gold)
    if NON_MONOTONIC:
        cost += gold[s.stack[0]] == s.i
    return cost
-cdef int _left_cost(const State* s, const int* gold) except -1:
+cdef int _left_cost(const State* s, int* gold) except -1:
    assert s.stack_len >= 1
    cost = 0
    if gold[s.stack[0]] == s.i:
@ -71,17 +59,11 @@ cdef int _left_cost(const State* s, const int* gold) except -1:
    cost += head_in_buffer(s, s.stack[0], gold)
    cost += children_in_buffer(s, s.stack[0], gold)
    if NON_MONOTONIC and s.stack_len >= 2:
        cost += gold[s.stack[0]] == s.stack[-1]
    return cost
-cdef int _reduce_cost(const State* s, const int* gold) except -1:
+cdef int _reduce_cost(const State* s, int* gold) except -1:
-    cdef int cost = 0
+    return children_in_buffer(s, s.stack[0], gold)
    cost += children_in_buffer(s, s.stack[0], gold)
    if NON_MONOTONIC:
        cost += head_in_buffer(s, s.stack[0], gold)
    return cost
 cdef class TransitionSystem:
@ -91,40 +73,38 @@ cdef class TransitionSystem:
        right_labels.sort()
        if 'ROOT' in right_labels:
            right_labels.pop(right_labels.index('ROOT'))
        if 'dep' in right_labels:
            right_labels.pop(right_labels.index('dep'))
        if 'ROOT' in left_labels:
            left_labels.pop(left_labels.index('ROOT'))
        if 'dep' in left_labels:
            left_labels.pop(left_labels.index('dep'))
        self.n_moves = 2 + len(left_labels) + len(right_labels) 
        moves = <Transition*>self.mem.alloc(self.n_moves, sizeof(Transition))
        cdef int i = 0
        moves[i].move = SHIFT
        moves[i].label = 0
        moves[i].clas = i
        i += 1
        moves[i].move = REDUCE
        moves[i].label = 0
        moves[i].clas = i
        i += 1
-        self.label_ids = {'ROOT': 0}
+        self.label_ids = {'ROOT': 0, 'dep': -1}
        cdef int label_id
        for label_str in left_labels:
            label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
            moves[i].move = LEFT
            moves[i].label = label_id
            moves[i].clas = i
            i += 1
        for label_str in right_labels:
            label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
            moves[i].move = RIGHT
            moves[i].label = label_id
            moves[i].clas = i
            i += 1
        self._moves = moves
-    cdef int transition(self, State *s, const Transition* t) except -1:
+    cdef int transition(self, State *s, const int clas) except -1:
        cdef const Transition* t = &self._moves[clas]
        if t.move == SHIFT:
            # Set the dep label, in case we need it after we reduce
            if NON_MONOTONIC:
                get_s0(s).dep_tag = t.label
            push_stack(s)
        elif t.move == LEFT:
            add_dep(s, s.i, s.stack[0], t.label)
@ -133,12 +113,11 @@ cdef class TransitionSystem:
            add_dep(s, s.stack[0], s.i, t.label)
            push_stack(s)
        elif t.move == REDUCE:
            add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep_tag)
            pop_stack(s)
        else:
            raise StandardError(t.move)
-    cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
+    cdef int best_valid(self, const weight_t* scores, const State* s) except -1:
        cdef bint[N_MOVES] valid
        valid[SHIFT] = _can_shift(s)
        valid[LEFT] = _can_left(s)
@ -147,61 +126,59 @@ cdef class TransitionSystem:
        cdef int best = -1
        cdef weight_t score = 0
        cdef weight_t best_r_score = -9000
        cdef int best_r_label = -1
        cdef int i
        for i in range(self.n_moves):
            if valid[self._moves[i].move] and (best == -1 or scores[i] > score):
                best = i
                score = scores[i]
            if self._moves[i].move == RIGHT and scores[i] > best_r_score:
                best_r_label = self._moves[i].label
        assert best >= 0
-        cdef Transition t = self._moves[best]
+        return best
        t.score = score
        if t.move == SHIFT:
            t.label = best_r_label
        return t
-    cdef Transition best_gold(self, Transition* guess, const weight_t* scores,
+    cdef int best_gold(self, const weight_t* scores, const State* s,
-                              const State* s,
+                       int* gold_heads, int* gold_labels) except -1:
                              const int* gold_heads, const int* gold_labels) except *:
        # Select the move to treat as correct in state `s`, given the per-move
        # `scores` and the gold arrays `gold_heads` / `gold_labels`.
        # The '+' signature returns an index into self._moves; the '-'
        # signature returned a Transition and also wrote a cost into `guess`.
        # NOTE(review): old and new bodies are interleaved below and several
        # unmarked lines belong to only one of them (e.g. `cdef int i` appears
        # twice, and `target_label`/`guess`/`t` are used only by the old body)
        # -- confirm against the real patch before relying on line markers.
        # If we can create a gold dependency, only one action can be correct
        cdef int[N_MOVES] unl_costs
        # Unlabelled cost per move type, or -1 when the move is not allowed.
        unl_costs[SHIFT] = _shift_cost(s, gold_heads) if _can_shift(s) else -1
        unl_costs[LEFT] = _left_cost(s, gold_heads) if _can_left(s) else -1
        unl_costs[RIGHT] = _right_cost(s, gold_heads) if _can_right(s) else -1
        unl_costs[REDUCE] = _reduce_cost(s, gold_heads) if _can_reduce(s) else -1
-        guess.cost = unl_costs[guess.move]
+        cdef int cost
-        cdef Transition t
+        cdef int move
-        cdef int target_label
+        cdef int label
        cdef int i
        if gold_heads[s.stack[0]] == s.i:
            target_label = gold_labels[s.stack[0]]
            if guess.move == LEFT:
                guess.cost += guess.label != target_label
            for i in range(self.n_moves):
                t = self._moves[i]
                if t.move == LEFT and t.label == target_label:
                    return t
        elif gold_heads[s.i] == s.stack[0]:
            target_label = gold_labels[s.i]
            if guess.move == RIGHT:
                guess.cost += guess.label != target_label
            for i in range(self.n_moves):
                t = self._moves[i]
                if t.move == RIGHT and t.label == target_label:
                    return t
        cdef int best = -1
        cdef weight_t score = -9000
        cdef int i
        # New body: among moves whose unlabelled cost is 0, compute a label
        # cost and keep the highest-scoring move whose label cost is also 0.
        for i in range(self.n_moves):
-            t = self._moves[i]
+            move = self._moves[i].move
-            if unl_costs[t.move] == 0 and (best == -1 or scores[i] > score):
+            label = self._moves[i].label
-                best = i
+            if unl_costs[move] == 0: 
-                score = scores[i]
+                if move == SHIFT or move == REDUCE:
-        t = self._moves[best]
+                    cost = 0
-        t.score = score
+                elif move == LEFT:
-        assert best >= 0
+                    if gold_heads[s.stack[0]] == s.i and gold_labels[s.stack[0]] != -1:
-        return t
+                        cost = label != gold_labels[s.stack[0]]
                    else:
                        cost = 0
                elif move == RIGHT:
                    # A gold label of -1 is exempt from the label comparison.
                    if gold_heads[s.i] == s.stack[0] and gold_labels[s.i] != -1:
                        cost = label != gold_labels[s.i]
                    else:
                        cost = 0
                else:
                    raise StandardError("Unknown Move")
                if cost == 0 and (best == -1 or scores[i] > score):
                    best = i
                    score = scores[i]
        # No zero-cost move was found: print the state for debugging, then fail.
        if best < 0:
            print unl_costs[SHIFT], unl_costs[REDUCE], unl_costs[LEFT], unl_costs[RIGHT]
            print s.stack_len
            print has_head(get_s0(s))
            print s.sent[s.stack[0]].head
            print s.stack[0], s.i
            print gold_heads[s.stack[0]], gold_heads[s.i]
            print gold_labels[s.i]
            print children_in_buffer(s, s.stack[0], gold_heads)
            print head_in_buffer(s, s.stack[0], gold_heads)
            raise StandardError 
        return best
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -2,6 +2,8 @@
 # cython: embedsignature=True
 from __future__ import unicode_literals
 from os import path
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
@ -28,6 +30,17 @@ cdef class Tokenizer:
        self.vocab = Vocab(self.get_props)
        self._load_special_tokenization(rules)
    @classmethod
    def from_dir(cls, Vocab vocab, object data_dir):
        """Build a Tokenizer from the language data stored in a directory.

        vocab -- the Vocab handed on to the constructor unchanged.
        data_dir -- path of the directory read by util.read_lang_data().

        Raises IOError if data_dir does not exist or is not a directory.
        """
        if not path.exists(data_dir):
            raise IOError("Directory %s not found -- cannot load Tokenizer." % data_dir)
        if not path.isdir(data_dir):
            raise IOError("Path %s is a file, not a dir -- cannot load Tokenizer." % data_dir)
        # Both conditions are enforced above with IOError, so no further
        # assertion on data_dir is needed before reading from it.
        rules, prefix_re, suffix_re, infix_re = util.read_lang_data(data_dir)
        return cls(vocab, rules, prefix_re, suffix_re, infix_re)
    cpdef Tokens tokens_from_list(self, list strings):
        cdef int length = sum([len(s) for s in strings])
        cdef Tokens tokens = Tokens(self.vocab.strings, length)
--- a/spacy/typedefs.pxd
+++ b/spacy/typedefs.pxd
@ -1,6 +1,26 @@
 from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
 from libc.stdint cimport uint8_t
 # Google universal tag set
 # Members are numbered from 0 in declaration order, so NO_TAG is 0 and
 # N_UNIV_TAGS, being declared last, equals the number of members before it.
 cpdef enum univ_tag_t:
    NO_TAG
    ADJ
    ADV
    ADP
    CONJ
    DET
    NOUN
    NUM
    PRON
    PRT
    VERB
    X
    PUNCT
    EOL
    N_UNIV_TAGS
 ctypedef uint64_t hash_t
 ctypedef char* utf8_t
 ctypedef uint32_t attr_t
@ -10,11 +30,3 @@ ctypedef uint16_t len_t
 ctypedef uint16_t tag_t
 # Morphological features of a word, one unsigned byte per feature.
 # NOTE(review): each field presumably stores a member of a matching
 # per-language feature enum -- confirm against the code that fills it in.
 cdef struct Morphology:
    uint8_t number
    uint8_t tenspect # Tense/aspect/voice
    uint8_t mood
    uint8_t gender
    uint8_t person
    uint8_t case
    uint8_t misc
--- a/spacy/utf8string.pxd
+++ b/spacy/utf8string.pxd
@ -1,34 +0,0 @@
 from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
 from .typedefs cimport utf8_t, id_t, hash_t
 # One interned byte string as stored by StringStore.
 cdef struct Utf8Str:
    # Integer id of the string; StringStore.intern() sets it to the slot index.
    id_t i
    # hash64 of the bytes, used as the lookup key.
    hash_t key
    # The bytes themselves; exactly `length` bytes are copied in, with no
    # terminating NUL appended, so always read them together with `length`.
    utf8_t chars
    int length
 # A view onto a run of Py_UNICODE characters plus its hash.
 # slice_unicode() fills in all three fields.
 cdef struct UniStr:
    # Pointer to the first character of the run (not a copy).
    Py_UNICODE* chars
    # Number of characters in the run.
    size_t n
    # hash64 over the run's n * sizeof(Py_UNICODE) bytes.
    hash_t key
 cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
    # Make `s` describe chars[start:end] in place (no copy) and hash that run.
    s.n = end - start
    s.chars = chars + start
    # hash64 takes a byte count, hence the multiplication by the element size.
    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
 # C-level declaration of the string-interning table.
 cdef class StringStore:
    # Pool used for every allocation the store makes.
    cdef Pool mem
    # Maps a string's hash to its slot index in `strings`.
    cdef PreshMap _map
    # Array of entries; slot 0 is never filled, real entries start at 1.
    cdef Utf8Str* strings
    # Index of the next free slot (starts at 1).
    cdef int size
    # Current capacity of `strings`; it is doubled when `size` reaches it.
    cdef int _resize_at
    # Return the entry for the given bytes, creating it if it is new.
    cdef const Utf8Str* intern(self, char* chars, int length) except NULL
--- a/spacy/utf8string.pyx
+++ b/spacy/utf8string.pyx
@ -1,80 +0,0 @@
 from libc.string cimport memcpy
 from murmurhash.mrmr cimport hash64
 import codecs
 SEPARATOR = '\n|-SEP-|\n'
 cdef class StringStore:
    """Interning table that maps UTF-8 byte strings to integer ids and back.

    Ids start at 1; slot 0 of the backing array is reserved and never filled.
    """
    def __init__(self):
        self._resize_at = 10000
        self.mem = Pool()
        self._map = PreshMap()
        self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
        # Slot 0 is reserved, so the first interned string receives id 1.
        self.size = 1
    property size:
        def __get__(self):
            # Number of interned strings; the reserved slot is not counted.
            return self.size - 1
    def __getitem__(self, object string_or_id):
        """Look up by id (returns the bytes) or by string (returns the id).

        Raises IndexError for an id outside the stored range and TypeError
        for any other kind of key.  Looking up a string interns it if new.
        """
        cdef const Utf8Str* entry
        cdef bytes encoded
        if isinstance(string_or_id, (int, long)):
            if not (1 <= string_or_id < self.size):
                raise IndexError(string_or_id)
            entry = &self.strings[<int>string_or_id]
            return entry.chars[:entry.length]
        if isinstance(string_or_id, bytes):
            entry = self.intern(<char*>string_or_id, len(string_or_id))
            return entry.i
        if isinstance(string_or_id, unicode):
            encoded = string_or_id.encode('utf8')
            entry = self.intern(<char*>encoded, len(encoded))
            return entry.i
        raise TypeError(type(string_or_id))
    cdef const Utf8Str* intern(self, char* chars, int length) except NULL:
        # Index 0 stands for "missing".  Rather than offsetting every index,
        # slot 0 is simply left unused.
        assert length != 0
        cdef hash_t key = hash64(chars, length * sizeof(char), 0)
        cdef void* found = self._map.get(key)
        cdef size_t slot
        cdef Utf8Str* entry
        if found != NULL:
            # Already interned: the map value is the slot index itself.
            return &self.strings[<size_t>found]
        if self.size == self._resize_at:
            # Array is full: double its capacity before writing the new slot.
            self._resize_at *= 2
            self.strings = <Utf8Str*>self.mem.realloc(self.strings, self._resize_at * sizeof(Utf8Str))
        slot = self.size
        entry = &self.strings[slot]
        entry.i = slot
        entry.key = key
        entry.chars = <char*>self.mem.alloc(length, sizeof(char))
        memcpy(entry.chars, chars, length)
        entry.length = length
        self._map.set(key, <void*>slot)
        self.size += 1
        return entry
    def dump(self, loc):
        """Write every slot, including the reserved slot 0, to the file `loc`."""
        cdef Utf8Str* entry
        cdef bytes raw
        cdef int idx
        decoded = []
        for idx in range(self.size):
            entry = &self.strings[idx]
            raw = entry.chars[:entry.length]
            decoded.append(raw.decode('utf8'))
        with codecs.open(loc, 'w', 'utf8') as out_file:
            out_file.write(SEPARATOR.join(decoded))
    def load(self, loc):
        """Intern, in order, the strings stored in the file `loc`."""
        cdef unicode text
        cdef bytes encoded
        with codecs.open(loc, 'r', 'utf8') as in_file:
            pieces = in_file.read().split(SEPARATOR)
        # The first piece corresponds to the reserved slot 0, so skip it.
        for text in pieces[1:]:
            encoded = text.encode('utf8')
            self.intern(encoded, len(encoded))
--- a/spacy/util.py
+++ b/spacy/util.py
@ -11,8 +11,7 @@ def utf8open(loc, mode='r'):
    return codecs.open(loc, mode, 'utf8')
-def read_lang_data(name):
+def read_lang_data(data_dir):
    data_dir = path.join(DATA_DIR, name)
    with open(path.join(data_dir, 'specials.json')) as file_:
        tokenization = ujson.load(file_)
    prefix = read_prefix(data_dir)
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -19,6 +19,17 @@ cdef class Vocab:
        self.lexemes.push_back(&EMPTY_LEXEME)
        self.get_lex_props = get_props
    @classmethod
    def from_dir(cls, object data_dir, object get_lex_props=None):
        """Create a Vocab and fill it from the files in a directory.

        data_dir -- directory holding the 'strings' and 'lexemes' files.
        get_lex_props -- passed to the constructor as its single argument.

        Raises IOError if data_dir does not exist or is not a directory.
        """
        if not path.exists(data_dir):
            raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
        if not path.isdir(data_dir):
            raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
        # Forward this method's own argument.  The name `get_props` is a
        # parameter of __init__ and is not defined in this scope.
        cdef Vocab self = cls(get_lex_props)
        self.strings.load(path.join(data_dir, 'strings'))
        self.load(path.join(data_dir, 'lexemes'))
        return self
    def __len__(self):
        return self.lexemes.size()