* Complete refactor of Tagger features, to use a generic list of context names.

2025-12-07 10:14:22 +03:00 · 2014-11-05 20:45:29 +11:00 · 2014-11-05 20:45:29 +11:00 · 4ecbe8c893
commit 4ecbe8c893
parent 0a8c84625d
14 changed files with 166 additions and 450 deletions
--- a/spacy/context.pxd
+++ b/spacy/context.pxd
@ -4,40 +4,42 @@ from .tokens cimport Tokens
 from .lexeme cimport Lexeme
-cdef struct Token:
+cdef class Token:
-    atom_t i
+    cdef readonly atom_t i
-    atom_t c
+    cdef readonly atom_t c
-    atom_t w
+    cdef readonly atom_t w
-    atom_t shape
+    cdef readonly atom_t shape
-    atom_t pref
+    cdef readonly atom_t pref
-    atom_t suff
+    cdef readonly atom_t suff
-    atom_t oft_title
+    cdef readonly atom_t oft_title
-    atom_t oft_upper
+    cdef readonly atom_t oft_upper
-    atom_t is_alpha
+    cdef readonly atom_t is_alpha
-    atom_t is_digit
+    cdef readonly atom_t is_digit
-    atom_t is_title
+    cdef readonly atom_t is_title
-    atom_t is_upper
+    cdef readonly atom_t is_upper
-    atom_t url
+    cdef readonly atom_t url
-    atom_t num
+    cdef readonly atom_t num
-    atom_t postype
+    cdef readonly atom_t postype
-    atom_t pos
+    cdef readonly atom_t pos
-    atom_t ner
+    cdef readonly atom_t ner
-cdef struct Slots:
+cdef class Slots:
-    Token P2
+    cdef readonly Token P2
-    Token P1
+    cdef readonly Token P1
-    Token N0
+    cdef readonly Token N0
-    Token N1
+    cdef readonly Token N1
-    Token N2
+    cdef readonly Token N2
 cdef Slots FIELD_IDS
 cdef int N_FIELDS
-cdef hash_t fill_slots(Slots* s, int i, Tokens tokens) except 0
+cdef hash_t fill_slots(Slots s, int i, Tokens tokens) except 0
-cdef int fill_flat(atom_t* context, Slots* s) except -1
+cdef int fill_flat(atom_t* context, Slots s) except -1
 cpdef Slots FIELD_IDS
--- a/spacy/context.pyx
+++ b/spacy/context.pyx
@ -2,7 +2,16 @@ from murmurhash.mrmr cimport hash64
 from .lexeme cimport *
-cdef void _number_token(Token* t, int* n_fields):
+cdef class Slots:
    # Holds one Token object per context position: P2, P1, N0, N1 and N2.
    # fill_slots() in this file fills them from tokens i-2, i-1, i, i+1 and
    # i+2 respectively.
    def __init__(self):
        # Create a separate Token for every slot; fill_token() later writes
        # into these objects in place.
        self.P2 = Token()
        self.P1 = Token()
        self.N0 = Token()
        self.N1 = Token()
        self.N2 = Token()
 cdef void _number_token(Token t, int* n_fields):
    cdef int i = n_fields[0]
    t.i = i; i += 1
    t.c = i; i += 1
@ -27,7 +36,7 @@ cdef void _number_token(Token* t, int* n_fields):
    n_fields[0] = i
-cdef int fill_token(Token* t, Lexeme* lex, atom_t pos, atom_t ner):
+cdef int fill_token(Token t, Lexeme* lex, atom_t pos, atom_t ner):
    t.i = lex.sic
    t.c = lex.cluster
    t.w = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
@ -48,7 +57,7 @@ cdef int fill_token(Token* t, Lexeme* lex, atom_t pos, atom_t ner):
    t.ner = ner
-cdef int _flatten_token(atom_t* context, Token* ids, Token* vals) except -1:
+cdef int _flatten_token(atom_t* context, Token ids, Token vals) except -1:
    context[ids.i] = vals.i
    context[ids.c] = vals.c
    context[ids.w] = vals.w
@ -68,26 +77,27 @@ cdef int _flatten_token(atom_t* context, Token* ids, Token* vals) except -1:
    context[ids.ner] = vals.ner
-cdef hash_t fill_slots(Slots* s, int i, Tokens tokens) except 0:
+cdef hash_t fill_slots(Slots s, int i, Tokens tokens) except 0:
-    fill_token(&s.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2])
+    fill_token(s.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2])
-    fill_token(&s.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1])
+    fill_token(s.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1])
-    fill_token(&s.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i])
+    fill_token(s.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i])
-    fill_token(&s.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1])
+    fill_token(s.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1])
-    fill_token(&s.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2])
+    fill_token(s.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2])
-    return hash64(s, sizeof(Slots), 0)
+    return 1
-cdef int fill_flat(atom_t* context, Slots* s) except -1:
+cdef int fill_flat(atom_t* context, Slots s) except -1:
-    _flatten_token(context, &FIELD_IDS.P2, &s.P2)
+    _flatten_token(context, FIELD_IDS.P2, s.P2)
-    _flatten_token(context, &FIELD_IDS.P1, &s.P1)
+    _flatten_token(context, FIELD_IDS.P1, s.P1)
-    _flatten_token(context, &FIELD_IDS.N0, &s.N0)
+    _flatten_token(context, FIELD_IDS.N0, s.N0)
-    _flatten_token(context, &FIELD_IDS.N1, &s.N1)
+    _flatten_token(context, FIELD_IDS.N1, s.N1)
-    _flatten_token(context, &FIELD_IDS.N2, &s.N2)
+    _flatten_token(context, FIELD_IDS.N2, s.N2)
 N_FIELDS = 0
-_number_token(&FIELD_IDS.P2, &N_FIELDS)
+FIELD_IDS = Slots()
-_number_token(&FIELD_IDS.P1, &N_FIELDS)
+_number_token(FIELD_IDS.P2, &N_FIELDS)
-_number_token(&FIELD_IDS.N0, &N_FIELDS)
+_number_token(FIELD_IDS.P1, &N_FIELDS)
-_number_token(&FIELD_IDS.N1, &N_FIELDS)
+_number_token(FIELD_IDS.N0, &N_FIELDS)
-_number_token(&FIELD_IDS.N2, &N_FIELDS)
+_number_token(FIELD_IDS.N1, &N_FIELDS)
 _number_token(FIELD_IDS.N2, &N_FIELDS)
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@ -42,6 +42,7 @@ cdef class Language:
    cpdef readonly Lexicon lexicon
    cpdef readonly Tagger pos_tagger
    cpdef readonly Tagger ner_tagger
    cdef object _prefix_re
    cdef object _suffix_re
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@ -45,6 +45,8 @@ cdef class Language:
            self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos'))
        else:
            self.pos_tagger = None
        if path.exists(path.join(util.DATA_DIR, name, 'ner')):
            self.ner_tagger = Tagger(path.join(util.DATA_DIR, name, 'ner'))
    cpdef Tokens tokenize(self, unicode string):
        """Tokenize a string.
--- a/spacy/ner_feats.pxd
+++ b/spacy/ner_feats.pxd
--- a/spacy/ner_feats.pyx
+++ b/spacy/ner_feats.pyx
@ -0,0 +1,35 @@
 from spacy.context cimport FIELD_IDS, Token
 # Per-slot Token objects taken from the shared FIELD_IDS layout. context.pyx
 # numbers every field of these Tokens via _number_token(), so each attribute
 # read below (e.g. N0.i) is an integer position in the flat context array.
 cdef Token P2 = FIELD_IDS.P2
 cdef Token P1 = FIELD_IDS.P1
 cdef Token N0 = FIELD_IDS.N0
 cdef Token N1 = FIELD_IDS.N1
 cdef Token N2 = FIELD_IDS.N2
 # Feature templates for the NER features module: every inner tuple names the
 # context fields that make up one template.
 # NOTE(review): presumably handed to thinc's Extractor the same way
 # tagger.pyx passes `templates` -- the consumer is not visible here; confirm.
 TEMPLATES = (
    (N0.i,),
    (N0.c,),
    (P1.pos,),
    (P1.i,),
    (N1.w,),
    (N1.pos,),
    (P1.ner,),
    (P2.ner,),
    # NOTE(review): (N0.c,) is already listed above; this second entry looks
    # like an accidental duplicate -- confirm before removing it, because
    # doing so changes the number of templates.
    (N0.c,),
    (P1.c,),
    (N1.c,),
    (N0.is_alpha,),
    (N0.is_digit,),
    (N0.is_title,),
    (N0.is_upper,),
    (N0.is_title, N0.oft_title),
    (N0.is_upper, N0.oft_upper),
 )
--- a/spacy/pos.pyx
+++ b/spacy/pos.pyx
@ -1,229 +0,0 @@
 # cython: profile=True
 from os import path
 import os
 import shutil
 import ujson
 import random
 import codecs
 import gzip
 import cython
 from libc.stdint cimport uint32_t
 from thinc.weights cimport arg_max
 from thinc.features import NonZeroConjFeat
 from thinc.features import ConjFeat
 from .lexeme cimport *
 from .lang cimport Lexicon
 NULL_TAG = 0
 cdef class Tagger:
    # Tagger that builds atoms from a five-token window plus the two previous
    # tags, extracts features with a thinc Extractor and scores them with a
    # LinearModel. tell_answer() feeds the gold tag back to the model.
    #
    # Tag-name -> class-id table. This is a class attribute, so one dict is
    # shared by every Tagger instance and by encode_pos().
    tags = {'NULL': NULL_TAG}
    def __init__(self, model_dir):
        # model_dir: directory that may contain 'postags.json' (tag table)
        # and 'model' (saved weights); each is only read when it exists.
        self.mem = Pool()
        tags_loc = path.join(model_dir, 'postags.json')
        if path.exists(tags_loc):
            with open(tags_loc) as file_:
                # Merge the stored tag ids into the shared class-level table.
                Tagger.tags.update(ujson.load(file_))
        # One model class per known tag.
        self.model = LinearModel(len(self.tags))
        if path.exists(path.join(model_dir, 'model')):
            self.model.load(path.join(model_dir, 'model'))
        # Every template is paired with the ConjFeat feature type.
        self.extractor = Extractor(TEMPLATES, [ConjFeat for _ in TEMPLATES])
        # Scratch buffers allocated from self.mem; predict() overwrites them
        # on every call.
        self._atoms = <atom_t*>self.mem.alloc(CONTEXT_SIZE, sizeof(atom_t))
        self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
        self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
        self._scores = <weight_t*>self.mem.alloc(len(self.tags), sizeof(weight_t))
        self._guess = NULL_TAG
    cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0:
        # Predict the tag for tokens[i] given the two previously assigned
        # tags. The guess is cached in self._guess for a following
        # tell_answer() call and is also returned.
        # NOTE(review): `except 0` makes 0 the error return value, and 0 is
        # also NULL_TAG -- confirm the model can never return class 0 here.
        get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i],
                  tokens.lex[i+1], tokens.lex[i+2], prev, prev_prev)
        self.extractor.extract(self._feats, self._values, self._atoms, NULL)
        self._guess = self.model.score(self._scores, self._feats, self._values)
        return self._guess
    cpdef bint tell_answer(self, class_t gold) except *:
        # Update the model with the gold tag for the most recent predict()
        # call, using the features still held in self._feats.
        cdef class_t guess = self._guess
        if gold == guess or gold == NULL_TAG:
            # Guess was right, or gold is NULL_TAG: pass an empty update.
            self.model.update({})
            return 0
        # Wrong guess: count the extracted features with +1 for the gold
        # class and -1 for the guessed class, then apply the update.
        counts = {guess: {}, gold: {}}
        self.extractor.count(counts[gold], self._feats, 1)
        self.extractor.count(counts[guess], self._feats, -1)
        self.model.update(counts)
    @classmethod
    def encode_pos(cls, tag):
        # Return the id for `tag`, registering it under the next free id
        # (the current table size) when it has not been seen before.
        if tag not in cls.tags:
            cls.tags[tag] = len(cls.tags)
        return cls.tags[tag]
# Tag every token in `tokens` with `tagger` and add one to
# tag_counts[lexeme id, predicted tag] for each token whose lexeme id is
# below tag_counts.shape[0].
# NOTE(review): bounds checking is switched off and only the first axis is
# guarded; `tag` is never compared with tag_counts.shape[1] -- confirm the
# caller sizes the second axis to cover every tag id.
@cython.boundscheck(False)
 def count_tags(Tagger tagger, Tokens tokens, uint32_t[:, :] tag_counts):
    cdef class_t prev_prev, prev, tag
    # Both history slots start as the 'EOL' tag; this lookup needs 'EOL' to be
    # present in tagger.tags.
    prev = tagger.tags['EOL']; prev_prev = tagger.tags['EOL']
    cdef int i
    cdef id_t token
    for i in range(tokens.length):
        tag = tagger.predict(i, tokens, prev, prev_prev)
        # Shift the tag history one step to the right.
        prev_prev = prev
        prev = tag
        token = tokens.lex[i].id
        # Lexeme ids at or beyond the first axis length are not counted.
        if token < tag_counts.shape[0]:
            tag_counts[token, tag] += 1
 # Positions in the flat atom array filled by get_atoms(): thirteen per-token
 # fields for each of P2, P1, N0, N1 and N2 (in the order written by
 # _fill_token()), followed by the two previous-tag slots. CONTEXT_SIZE is
 # listed last, so its value is the number of positions before it; Tagger uses
 # it as the length of its atom buffer.
 # NOTE(review): P1pre breaks the *pref naming used by the other four groups
 # and was probably meant to be P1pref -- neither name is referenced in the
 # templates visible in this file.
 cpdef enum:
    # Fields of the token two positions back (lex[i-2]).
    P2i
    P2c
    P2w
    P2shape
    P2pref
    P2suff
    P2title
    P2upper
    P2oft_title
    P2oft_upper
    P2pos
    P2url
    P2num
    # Fields of the previous token (lex[i-1]).
    P1i
    P1c
    P1w
    P1shape
    P1pre
    P1suff
    P1title
    P1upper
    P1oft_title
    P1oft_upper
    P1pos
    P1url
    P1num
    # Fields of the current token (lex[i]).
    N0i
    N0c
    N0w
    N0shape
    N0pref
    N0suff
    N0title
    N0upper
    N0oft_title
    N0oft_upper
    N0pos
    N0url
    N0num
    # Fields of the next token (lex[i+1]).
    N1i
    N1c
    N1w
    N1shape
    N1pref
    N1suff
    N1title
    N1upper
    N1oft_title
    N1oft_upper
    N1pos
    N1url
    N1num
    # Fields of the token two positions ahead (lex[i+2]).
    N2i
    N2c
    N2w
    N2shape
    N2pref
    N2suff
    N2title
    N2upper
    N2oft_title
    N2oft_upper
    N2pos
    N2url
    N2num
    # Previously assigned tags: P2t receives prev_prev_tag, P1t receives
    # prev_tag (see get_atoms()).
    P2t
    P1t
    CONTEXT_SIZE
 # Fill `atoms` for one tagging decision: the per-token fields of the five
 # lexemes p2, p1, n0, n1, n2 go into the blocks starting at P2i, P1i, N0i,
 # N1i and N2i, and the two previous tags go into P1t and P2t.
 # `atoms` must have room for CONTEXT_SIZE entries. There is no explicit
 # return statement; -1 is reserved as the error value by `except -1`.
 cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1,
                   Lexeme* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
    _fill_token(&atoms[P2i], p2)
    _fill_token(&atoms[P1i], p1)
    _fill_token(&atoms[N0i], n0)
    _fill_token(&atoms[N1i], n1)
    _fill_token(&atoms[N2i], n2)
    # Tag history: the most recent tag goes to P1t, the one before it to P2t.
    atoms[P1t] = prev_tag
    atoms[P2t] = prev_prev_tag
 # Write the thirteen per-token atoms for `lex` into atoms[0] .. atoms[12].
 # The offsets line up with the per-token field order of the enum above
 # (i, c, w, shape, pref, suff, title, upper, oft_title, oft_upper, pos, url,
 # num).
 cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
    atoms[0] = lex.sic
    atoms[1] = lex.cluster
    # Use the norm only when prob is non-zero and at least -10; otherwise fall
    # back to the shape.
    # NOTE(review): presumably prob is a log probability, which would make
    # this a frequency cut-off -- confirm against the Lexeme definition.
    atoms[2] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
    atoms[3] = lex.shape
    atoms[4] = lex.prefix
    atoms[5] = lex.suffix
    # The flag bits are masked but not shifted down, so a set flag is stored
    # as the non-zero value (1 << FLAG) rather than as 1.
    atoms[6] = lex.flags & (1 << IS_TITLE)
    atoms[7] = lex.flags & (1 << IS_UPPER)
    atoms[8] = lex.flags & (1 << OFT_TITLE)
    atoms[9] = lex.flags & (1 << OFT_UPPER)
    # The slot named *pos in the enum holds the lexeme's postype.
    atoms[10] = lex.postype
    atoms[11] = lex.flags & (1 << LIKE_URL)
    atoms[12] = lex.flags & (1 << LIKE_NUMBER)
 # Feature templates for the POS tagger. Every inner tuple lists positions in
 # the atom array (names from the enum above); Tagger.__init__ passes this
 # tuple to Extractor with one ConjFeat per template.
 TEMPLATES = (
    (N0i,),
    (N0w,),
    (N0suff,),
    (N0pref,),
    (P1t,),
    (P2t,),
    (P1t, P2t),
    (P1t, N0w),
    (P1w,),
    (P1suff,),
    (P2w,),
    (N1w,),
    (N1suff,),
    (N2w,),
    (N0shape,),
    (N0c,),
    (N1c,),
    (N2c,),
    (P1c,),
    (P2c,),
    (P1c, N0c),
    (N0c, N1c),
    (P1c, P1t),
    (P1c, P1t, N0c),
    (P1t, N0c),
    (N0oft_upper,),
    (N0oft_title,),
    (P1w, N0w),
    (N0w, N1w),
    (N0pos,),
    (P1t, N0pos, N1pos),
    (P1t, N1pos),
    (N0url,),
    (N0num,),
    # NOTE(review): (P1url,) is listed twice in a row; the second entry may
    # have been meant as (P1num,) -- confirm the intent before changing it,
    # since any edit alters the template list the model is built from.
    (P1url,),
    (P1url,),
    (N1num,),
    (N1url,),
 )
--- a/spacy/pos_feats.pxd
+++ b/spacy/pos_feats.pxd
@ -1,83 +0,0 @@
 from .tokens cimport Tokens
 from thinc.typedefs cimport atom_t
 # Positions in the flat context array filled by fill_context(): thirteen
 # per-token fields for each of P2, P1, N0, N1 and N2, followed by the two
 # previous-tag slots. CONTEXT_SIZE is listed last, so its value is the number
 # of positions before it.
 # NOTE(review): P1pre breaks the *pref naming used by the other four groups
 # and was probably meant to be P1pref -- confirm that nothing refers to
 # P1pre before renaming.
 cpdef enum:
    # Fields of the token two positions back (lex[i-2]).
    P2i
    P2c
    P2w
    P2shape
    P2pref
    P2suff
    P2title
    P2upper
    P2oft_title
    P2oft_upper
    P2pos
    P2url
    P2num
    # Fields of the previous token (lex[i-1]).
    P1i
    P1c
    P1w
    P1shape
    P1pre
    P1suff
    P1title
    P1upper
    P1oft_title
    P1oft_upper
    P1pos
    P1url
    P1num
    # Fields of the current token (lex[i]).
    N0i
    N0c
    N0w
    N0shape
    N0pref
    N0suff
    N0title
    N0upper
    N0oft_title
    N0oft_upper
    N0pos
    N0url
    N0num
    # Fields of the next token (lex[i+1]).
    N1i
    N1c
    N1w
    N1shape
    N1pref
    N1suff
    N1title
    N1upper
    N1oft_title
    N1oft_upper
    N1pos
    N1url
    N1num
    # Fields of the token two positions ahead (lex[i+2]).
    N2i
    N2c
    N2w
    N2shape
    N2pref
    N2suff
    N2title
    N2upper
    N2oft_title
    N2oft_upper
    N2pos
    N2url
    N2num
    # Previously assigned tags: fill_context() stores pos[i-2] in P2t and
    # pos[i-1] in P1t.
    P2t
    P1t
    CONTEXT_SIZE
 # Declaration of the context filler implemented in pos_feats.pyx: writes the
 # atoms for position `i` of `tokens` into `context`; -1 is the error return.
 cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1
--- a/spacy/pos_feats.pyx
+++ b/spacy/pos_feats.pyx
@ -1,77 +1,41 @@
-from .lexeme cimport *
+from spacy.context cimport FIELD_IDS, Token
-from thinc.typedefs cimport atom_t
+
 cpdef Token P2 = FIELD_IDS.P2
 cpdef Token P1 = FIELD_IDS.P1
 cpdef Token N0 = FIELD_IDS.N0
 cpdef Token N1 = FIELD_IDS.N1
 cpdef Token N2 = FIELD_IDS.N2
 TEMPLATES = (
-    (N0i,),
+    (N0.i,),
-    (N0w,),
+    (N0.w,),
-    (N0suff,),
+    (N0.suff,),
-    (N0pref,),
+    (N0.pref,),
-    (P1t,),
+    (P1.pos,),
-    (P2t,),
+    (P2.pos,),
-    (P1t, P2t),
+    (P1.pos, P2.pos),
-    (P1t, N0w),
+    (P1.pos, N0.w),
-    (P1w,),
+    (P1.w,),
-    (P1suff,),
+    (P1.suff,),
-    (P2w,),
+    (P2.w,),
-    (N1w,),
+    (N1.w,),
-    (N1suff,),
+    (N1.suff,),
-    (N2w,),
+    (N2.w,),
-    (N0shape,),
+    (N0.shape,),
-    (N0c,),
+    (N0.c,),
-    (N1c,),
+    (N1.c,),
-    (N2c,),
+    (N2.c,),
-    (P1c,),
+    (P1.c,),
-    (P2c,),
+    (P2.c,),
-    (P1c, N0c),
+    (N0.oft_upper,),
-    (N0c, N1c),
+    (N0.oft_title,),
    (P1c, P1t),
    (P1c, P1t, N0c),
    (P1t, N0c),
    (N0oft_upper,),
    (N0oft_title,),
-    (P1w, N0w),
+    (N0.postype,),
    (N0w, N1w),
-    (N0pos,),
+    (P1.url,),
-    (P1t, N0pos, N1pos),
+    (N1.num,),
-    (P1t, N1pos),
+    (N1.url,),
    (N0url,),
    (N0num,),
    (P1url,),
    (P1url,),
    (N1num,),
    (N1url,),
 )
 # Fill `context` for position `i`: the per-token fields of lex[i-2] .. lex[i+2]
 # go into the blocks starting at P2i, P1i, N0i, N1i and N2i, and the tags
 # already stored for the two preceding tokens go into P1t and P2t.
 # The reads at i-2 and i+2 depend on the padding that Tokens places around
 # its lex and pos arrays (PADDING in tokens.pyx).
 # There is no explicit return statement; -1 is reserved as the error value.
 cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1:
    _fill_token(&context[P2i], tokens.lex[i-2])
    _fill_token(&context[P1i], tokens.lex[i-1])
    _fill_token(&context[N0i], tokens.lex[i])
    _fill_token(&context[N1i], tokens.lex[i+1])
    _fill_token(&context[N2i], tokens.lex[i+2])
    # Tag history comes straight from the tokens' stored POS tags.
    context[P1t] = tokens.pos[i-1]
    context[P2t] = tokens.pos[i-2]
 # Write the thirteen per-token atoms for `lex` into atoms[0] .. atoms[12].
 # fill_context() calls this once per context slot with `atoms` pointing at
 # the first position of that slot's block.
 cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
    atoms[0] = lex.sic
    atoms[1] = lex.cluster
    # Use the norm only when prob is non-zero and at least -10; otherwise fall
    # back to the shape.
    # NOTE(review): presumably prob is a log probability, which would make
    # this a frequency cut-off -- confirm against the Lexeme definition.
    atoms[2] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
    atoms[3] = lex.shape
    atoms[4] = lex.prefix
    atoms[5] = lex.suffix
    # The flag bits are masked but not shifted down, so a set flag is stored
    # as the non-zero value (1 << FLAG) rather than as 1.
    atoms[6] = lex.flags & (1 << IS_TITLE)
    atoms[7] = lex.flags & (1 << IS_UPPER)
    atoms[8] = lex.flags & (1 << OFT_TITLE)
    atoms[9] = lex.flags & (1 << OFT_UPPER)
    atoms[10] = lex.postype
    atoms[11] = lex.flags & (1 << LIKE_URL)
    atoms[12] = lex.flags & (1 << LIKE_NUMBER)
--- a/spacy/pos_util.py
+++ b/spacy/pos_util.py
@ -3,10 +3,8 @@ from . import util
 from . import tokens
 from .en import EN
 from .pos import Tagger
-
+def read_gold(file_, tag_list, col):
 def read_gold(file_, tag_list):
    paras = file_.read().strip().split('\n\n')
    golds = []
    tag_ids = dict((tag, i) for i, tag in enumerate(tag_list))
@ -21,7 +19,7 @@ def read_gold(file_, tag_list):
        conll_toks = []
        for line in lines:
            pieces = line.split()
-            conll_toks.append((int(pieces[0]), len(pieces[1]), pieces[3]))
+            conll_toks.append((int(pieces[0]), len(pieces[1]), pieces[col]))
        for i, token in enumerate(tokens):
            if not conll_toks:
                tags.append('NULL')
--- a/spacy/tagger.pxd
+++ b/spacy/tagger.pxd
@ -4,6 +4,8 @@ from thinc.learner cimport LinearModel
 from thinc.features cimport Extractor
 from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
 from .typedefs cimport hash_t
 from .context cimport Slots
 from .tokens cimport Tokens
@ -26,7 +28,8 @@ cdef class Tagger:
    cpdef readonly list tag_names
    cdef class_t _guess
-    cdef atom_t* _context
+    cdef atom_t* _context_flat
    cdef Slots _context_slots
    cdef feat_t* _feats
    cdef weight_t* _values
    cdef weight_t* _scores
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@ -10,8 +10,9 @@ import random
 import json
 import cython
-from .pos_feats cimport fill_context as pos_fill_context
+from .context cimport fill_slots
-from .pos_feats cimport CONTEXT_SIZE as POS_CONTEXT_SIZE
+from .context cimport fill_flat
 from .context cimport N_FIELDS
 from thinc.features cimport ConjFeat
@ -46,6 +47,7 @@ def train(train_sents, model_dir, nr_iter=5):
                if gold != NULL_TAG:
                    total += 1
                    n_corr += guess == gold
                #print('%s\t%d\t%d' % (tokens[i].string, guess, gold))
        print('%.4f' % ((n_corr / total) * 100))
        random.shuffle(train_sents)
    tagger.model.end_training()
@ -76,15 +78,12 @@ cdef class Tagger:
        self.tag_names = cfg['tag_names']
        self.tag_type = cfg['tag_type']
        self.extractor = Extractor(templates, [ConjFeat] * len(templates))
-        self.model = LinearModel(len(self.tag_names), self.extractor.n)
+        self.model = LinearModel(len(self.tag_names))
        print("Load tagger model")
        if path.exists(path.join(model_dir, 'model')):
            self.model.load(path.join(model_dir, 'model'))
        print("Done")
-        if self.tag_type == POS:
+        self._context_flat = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))
-            n_context = POS_CONTEXT_SIZE
+        self._context_slots = Slots()
        self._context = <atom_t*>self.mem.alloc(n_context, sizeof(atom_t))
        self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
        self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
        self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
@ -110,11 +109,9 @@ cdef class Tagger:
        >>> tag = EN.pos_tagger.predict(0, tokens)
        >>> assert tag == EN.pos_tagger.tag_id('DT') == 5
        """
-        if self.tag_type == POS:
+        cdef hash_t hashed = fill_slots(self._context_slots, i, tokens)
-            pos_fill_context(self._context, i, tokens)
+        fill_flat(self._context_flat, self._context_slots)
-        else:
+        self.extractor.extract(self._feats, self._values, self._context_flat, NULL)
            raise StandardError
        self.extractor.extract(self._feats, self._values, self._context, NULL)
        self._guess = self.model.score(self._scores, self._feats, self._values)
        return self._guess
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@ -15,9 +15,11 @@ cdef class Tokens:
    cdef Lexeme** _lex_ptr
    cdef int* _idx_ptr
    cdef int* _pos_ptr
    cdef int* _ner_ptr
    cdef Lexeme** lex
    cdef int* idx
    cdef int* pos
    cdef int* ner
    cdef int length
    cdef int max_length
@ -32,6 +34,7 @@ cdef class Token:
    cdef public int i
    cdef public int idx
    cdef public int pos
    cdef public int ner
    cdef public atom_t id
    cdef public atom_t cluster
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@ -1,6 +1,7 @@
 # cython: profile=True
 from .lexeme cimport *
 cimport cython
 from .tagger cimport POS, ENTITY
 DEF PADDING = 5
@ -44,21 +45,25 @@ cdef class Tokens:
        self._lex_ptr = <Lexeme**>self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*))
        self._idx_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
        self._pos_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
        self._ner_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
        self.lex = self._lex_ptr
        self.idx = self._idx_ptr
        self.pos = self._pos_ptr
        self.ner = self._ner_ptr
        cdef int i
        for i in range(size + (PADDING*2)):
            self.lex[i] = &EMPTY_LEXEME
        self.lex += PADDING
        self.idx += PADDING
        self.pos += PADDING
        self.ner += PADDING
        self.max_length = size
        self.length = 0
    def __getitem__(self, i):
        bounds_check(i, self.length, PADDING)
-        return Token(self._string_store, i, self.idx[i], self.pos[i], self.lex[i][0])
+        return Token(self._string_store, i, self.idx[i], self.pos[i], self.ner[i],
                     self.lex[i][0])
    def __iter__(self):
        for i in range(self.length):
@ -73,6 +78,7 @@ cdef class Tokens:
        self.lex[self.length] = lexeme
        self.idx[self.length] = idx
        self.pos[self.length] = 0
        self.ner[self.length] = 0
        self.length += 1
        return idx + lexeme.length
@ -91,7 +97,10 @@ cdef class Tokens:
        return idx
    cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1:
        if tag_type == POS:
            self.pos[i] = tag
        elif tag_type == ENTITY:
            self.ner[i] = tag
    def _realloc(self, new_size):
        self.max_length = new_size
@ -99,19 +108,23 @@ cdef class Tokens:
        self._lex_ptr = <Lexeme**>self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*))
        self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
        self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
        self._ner_ptr = <int*>self.mem.realloc(self._ner_ptr, n * sizeof(int))
        self.lex = self._lex_ptr + PADDING
        self.idx = self._idx_ptr + PADDING
        self.pos = self._pos_ptr + PADDING
        self.ner = self._ner_ptr + PADDING
        for i in range(self.length, self.max_length + PADDING):
            self.lex[i] = &EMPTY_LEXEME
@cython.freelist(64)
 cdef class Token:
-    def __init__(self, StringStore string_store, int i, int idx, int pos, dict lex):
+    def __init__(self, StringStore string_store, int i, int idx, int pos, int ner,
                 dict lex):
        self._string_store = string_store
        self.idx = idx
        self.pos = pos
        self.ner = ner
        self.i = i
        self.id = lex['id']