* Complete refactor of Tagger features, to use a generic list of context names.

Matthew Honnibal 2014-11-05 20:45:29 +11:00
parent 0a8c84625d
commit 4ecbe8c893
14 changed files with 166 additions and 450 deletions
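
In short: the per-tagger cpdef enums of context indices (P2i, P1c, and so on, visible in the deleted files below) are replaced by generic Slots/Token classes whose fields are numbered once at import time, so feature templates can name attributes like N0.w instead of enum members. A minimal pure-Python sketch of that scheme, with field names taken from the diff and everything else simplified:

    FIELDS = ('i', 'c', 'w', 'shape', 'pref', 'suff', 'oft_title', 'oft_upper',
              'is_alpha', 'is_digit', 'is_title', 'is_upper', 'url', 'num',
              'postype', 'pos', 'ner')

    class Token:
        # One slot of the window; holds one atom value per field. The same
        # class doubles as a record of field positions once numbered below.
        def __init__(self):
            for name in FIELDS:
                setattr(self, name, 0)

    class Slots:
        # The five-token window P2, P1, N0, N1, N2 shared by the taggers.
        def __init__(self):
            self.P2, self.P1, self.N0, self.N1, self.N2 = (
                Token() for _ in range(5))

    def number_token(t, n):
        # Give every field of this slot the next free index in the flat context.
        for name in FIELDS:
            setattr(t, name, n)
            n += 1
        return n

    FIELD_IDS = Slots()
    N_FIELDS = 0
    for slot in (FIELD_IDS.P2, FIELD_IDS.P1, FIELD_IDS.N0,
                 FIELD_IDS.N1, FIELD_IDS.N2):
        N_FIELDS = number_token(slot, N_FIELDS)

    # Templates can now name attributes instead of per-tagger enum members:
    TEMPLATES = ((FIELD_IDS.N0.w,), (FIELD_IDS.P1.pos, FIELD_IDS.N0.w))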

View File

@@ -4,40 +4,42 @@ from .tokens cimport Tokens
from .lexeme cimport Lexeme
cdef struct Token:
atom_t i
atom_t c
atom_t w
atom_t shape
atom_t pref
atom_t suff
atom_t oft_title
atom_t oft_upper
atom_t is_alpha
atom_t is_digit
atom_t is_title
atom_t is_upper
cdef class Token:
cdef readonly atom_t i
cdef readonly atom_t c
cdef readonly atom_t w
cdef readonly atom_t shape
cdef readonly atom_t pref
cdef readonly atom_t suff
cdef readonly atom_t oft_title
cdef readonly atom_t oft_upper
cdef readonly atom_t is_alpha
cdef readonly atom_t is_digit
cdef readonly atom_t is_title
cdef readonly atom_t is_upper
atom_t url
atom_t num
cdef readonly atom_t url
cdef readonly atom_t num
atom_t postype
atom_t pos
atom_t ner
cdef readonly atom_t postype
cdef readonly atom_t pos
cdef readonly atom_t ner
cdef struct Slots:
Token P2
Token P1
Token N0
Token N1
Token N2
cdef class Slots:
cdef readonly Token P2
cdef readonly Token P1
cdef readonly Token N0
cdef readonly Token N1
cdef readonly Token N2
cdef Slots FIELD_IDS
cdef int N_FIELDS
cdef hash_t fill_slots(Slots* s, int i, Tokens tokens) except 0
cdef hash_t fill_slots(Slots s, int i, Tokens tokens) except 0
cdef int fill_flat(atom_t* context, Slots* s) except -1
cdef int fill_flat(atom_t* context, Slots s) except -1
cpdef Slots FIELD_IDS

View File

@@ -2,7 +2,16 @@ from murmurhash.mrmr cimport hash64
from .lexeme cimport *
cdef void _number_token(Token* t, int* n_fields):
cdef class Slots:
def __init__(self):
self.P2 = Token()
self.P1 = Token()
self.N0 = Token()
self.N1 = Token()
self.N2 = Token()
cdef void _number_token(Token t, int* n_fields):
cdef int i = n_fields[0]
t.i = i; i += 1
t.c = i; i += 1
@@ -27,7 +36,7 @@ cdef void _number_token(Token* t, int* n_fields):
n_fields[0] = i
cdef int fill_token(Token* t, Lexeme* lex, atom_t pos, atom_t ner):
cdef int fill_token(Token t, Lexeme* lex, atom_t pos, atom_t ner):
t.i = lex.sic
t.c = lex.cluster
t.w = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
@@ -48,7 +57,7 @@ cdef int fill_token(Token* t, Lexeme* lex, atom_t pos, atom_t ner):
t.ner = ner
cdef int _flatten_token(atom_t* context, Token* ids, Token* vals) except -1:
cdef int _flatten_token(atom_t* context, Token ids, Token vals) except -1:
context[ids.i] = vals.i
context[ids.c] = vals.c
context[ids.w] = vals.w
@@ -68,26 +77,27 @@ cdef int _flatten_token(atom_t* context, Token* ids, Token* vals) except -1:
context[ids.ner] = vals.ner
cdef hash_t fill_slots(Slots* s, int i, Tokens tokens) except 0:
fill_token(&s.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2])
fill_token(&s.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1])
fill_token(&s.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i])
fill_token(&s.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1])
fill_token(&s.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2])
return hash64(s, sizeof(Slots), 0)
cdef hash_t fill_slots(Slots s, int i, Tokens tokens) except 0:
fill_token(s.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2])
fill_token(s.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1])
fill_token(s.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i])
fill_token(s.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1])
fill_token(s.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2])
return 1
cdef int fill_flat(atom_t* context, Slots* s) except -1:
_flatten_token(context, &FIELD_IDS.P2, &s.P2)
_flatten_token(context, &FIELD_IDS.P1, &s.P1)
_flatten_token(context, &FIELD_IDS.N0, &s.N0)
_flatten_token(context, &FIELD_IDS.N1, &s.N1)
_flatten_token(context, &FIELD_IDS.N2, &s.N2)
cdef int fill_flat(atom_t* context, Slots s) except -1:
_flatten_token(context, FIELD_IDS.P2, s.P2)
_flatten_token(context, FIELD_IDS.P1, s.P1)
_flatten_token(context, FIELD_IDS.N0, s.N0)
_flatten_token(context, FIELD_IDS.N1, s.N1)
_flatten_token(context, FIELD_IDS.N2, s.N2)
N_FIELDS = 0
_number_token(&FIELD_IDS.P2, &N_FIELDS)
_number_token(&FIELD_IDS.P1, &N_FIELDS)
_number_token(&FIELD_IDS.N0, &N_FIELDS)
_number_token(&FIELD_IDS.N1, &N_FIELDS)
_number_token(&FIELD_IDS.N2, &N_FIELDS)
FIELD_IDS = Slots()
_number_token(FIELD_IDS.P2, &N_FIELDS)
_number_token(FIELD_IDS.P1, &N_FIELDS)
_number_token(FIELD_IDS.N0, &N_FIELDS)
_number_token(FIELD_IDS.N1, &N_FIELDS)
_number_token(FIELD_IDS.N2, &N_FIELDS)
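
fill_flat then scatters the filled window into the flat atom array at those pre-assigned positions. Continuing the pure-Python sketch above (flatten_token is a simplified stand-in for _flatten_token):

    def flatten_token(context, ids, vals):
        # Write one slot's values at the positions numbering assigned to it.
        for name in FIELDS:
            context[getattr(ids, name)] = getattr(vals, name)

    def fill_flat(context, s):
        for ids, vals in ((FIELD_IDS.P2, s.P2), (FIELD_IDS.P1, s.P1),
                          (FIELD_IDS.N0, s.N0), (FIELD_IDS.N1, s.N1),
                          (FIELD_IDS.N2, s.N2)):
            flatten_token(context, ids, vals)

    context = [0] * N_FIELDS
    fill_flat(context, Slots())   # a fresh window: every atom is still zero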

View File

@@ -42,6 +42,7 @@ cdef class Language:
cpdef readonly Lexicon lexicon
cpdef readonly Tagger pos_tagger
cpdef readonly Tagger ner_tagger
cdef object _prefix_re
cdef object _suffix_re

View File

@@ -45,6 +45,8 @@ cdef class Language:
self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos'))
else:
self.pos_tagger = None
if path.exists(path.join(util.DATA_DIR, name, 'ner')):
self.ner_tagger = Tagger(path.join(util.DATA_DIR, name, 'ner'))
cpdef Tokens tokenize(self, unicode string):
"""Tokenize a string.

spacy/ner_feats.pxd (new file, 0 lines)
View File

spacy/ner_feats.pyx (new file, 35 lines)
View File

@@ -0,0 +1,35 @@
from spacy.context cimport FIELD_IDS, Token
cdef Token P2 = FIELD_IDS.P2
cdef Token P1 = FIELD_IDS.P1
cdef Token N0 = FIELD_IDS.N0
cdef Token N1 = FIELD_IDS.N1
cdef Token N2 = FIELD_IDS.N2
TEMPLATES = (
(N0.i,),
(N0.c,),
(P1.pos,),
(P1.i,),
(N1.w,),
(N1.pos,),
(P1.ner,),
(P2.ner,),
(N0.c,),
(P1.c,),
(N1.c,),
(N0.is_alpha,),
(N0.is_digit,),
(N0.is_title,),
(N0.is_upper,),
(N0.is_title, N0.oft_title),
(N0.is_upper, N0.oft_upper),
)
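
Each template is a tuple of flat-context indices. In the diff the actual extraction is done by thinc's Extractor with ConjFeat (and NonZeroConjFeat), which conjoins the atom values at those indices into one feature. A hypothetical stand-in for that step, only to show the shape of the data:

    def extract(context, templates):
        feats = []
        for tmpl_id, tmpl in enumerate(templates):
            values = tuple(context[idx] for idx in tmpl)
            if any(values):                       # skip all-zero conjunctions
                feats.append(hash((tmpl_id,) + values))
        return feats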

View File

@@ -1,229 +0,0 @@
# cython: profile=True
from os import path
import os
import shutil
import ujson
import random
import codecs
import gzip
import cython
from libc.stdint cimport uint32_t
from thinc.weights cimport arg_max
from thinc.features import NonZeroConjFeat
from thinc.features import ConjFeat
from .lexeme cimport *
from .lang cimport Lexicon
NULL_TAG = 0
cdef class Tagger:
tags = {'NULL': NULL_TAG}
def __init__(self, model_dir):
self.mem = Pool()
tags_loc = path.join(model_dir, 'postags.json')
if path.exists(tags_loc):
with open(tags_loc) as file_:
Tagger.tags.update(ujson.load(file_))
self.model = LinearModel(len(self.tags))
if path.exists(path.join(model_dir, 'model')):
self.model.load(path.join(model_dir, 'model'))
self.extractor = Extractor(TEMPLATES, [ConjFeat for _ in TEMPLATES])
self._atoms = <atom_t*>self.mem.alloc(CONTEXT_SIZE, sizeof(atom_t))
self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
self._scores = <weight_t*>self.mem.alloc(len(self.tags), sizeof(weight_t))
self._guess = NULL_TAG
cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0:
get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i],
tokens.lex[i+1], tokens.lex[i+2], prev, prev_prev)
self.extractor.extract(self._feats, self._values, self._atoms, NULL)
self._guess = self.model.score(self._scores, self._feats, self._values)
return self._guess
cpdef bint tell_answer(self, class_t gold) except *:
cdef class_t guess = self._guess
if gold == guess or gold == NULL_TAG:
self.model.update({})
return 0
counts = {guess: {}, gold: {}}
self.extractor.count(counts[gold], self._feats, 1)
self.extractor.count(counts[guess], self._feats, -1)
self.model.update(counts)
@classmethod
def encode_pos(cls, tag):
if tag not in cls.tags:
cls.tags[tag] = len(cls.tags)
return cls.tags[tag]
@cython.boundscheck(False)
def count_tags(Tagger tagger, Tokens tokens, uint32_t[:, :] tag_counts):
cdef class_t prev_prev, prev, tag
prev = tagger.tags['EOL']; prev_prev = tagger.tags['EOL']
cdef int i
cdef id_t token
for i in range(tokens.length):
tag = tagger.predict(i, tokens, prev, prev_prev)
prev_prev = prev
prev = tag
token = tokens.lex[i].id
if token < tag_counts.shape[0]:
tag_counts[token, tag] += 1
cpdef enum:
P2i
P2c
P2w
P2shape
P2pref
P2suff
P2title
P2upper
P2oft_title
P2oft_upper
P2pos
P2url
P2num
P1i
P1c
P1w
P1shape
P1pre
P1suff
P1title
P1upper
P1oft_title
P1oft_upper
P1pos
P1url
P1num
N0i
N0c
N0w
N0shape
N0pref
N0suff
N0title
N0upper
N0oft_title
N0oft_upper
N0pos
N0url
N0num
N1i
N1c
N1w
N1shape
N1pref
N1suff
N1title
N1upper
N1oft_title
N1oft_upper
N1pos
N1url
N1num
N2i
N2c
N2w
N2shape
N2pref
N2suff
N2title
N2upper
N2oft_title
N2oft_upper
N2pos
N2url
N2num
P2t
P1t
CONTEXT_SIZE
cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1,
Lexeme* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
_fill_token(&atoms[P2i], p2)
_fill_token(&atoms[P1i], p1)
_fill_token(&atoms[N0i], n0)
_fill_token(&atoms[N1i], n1)
_fill_token(&atoms[N2i], n2)
atoms[P1t] = prev_tag
atoms[P2t] = prev_prev_tag
cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
atoms[0] = lex.sic
atoms[1] = lex.cluster
atoms[2] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
atoms[3] = lex.shape
atoms[4] = lex.prefix
atoms[5] = lex.suffix
atoms[6] = lex.flags & (1 << IS_TITLE)
atoms[7] = lex.flags & (1 << IS_UPPER)
atoms[8] = lex.flags & (1 << OFT_TITLE)
atoms[9] = lex.flags & (1 << OFT_UPPER)
atoms[10] = lex.postype
atoms[11] = lex.flags & (1 << LIKE_URL)
atoms[12] = lex.flags & (1 << LIKE_NUMBER)
TEMPLATES = (
(N0i,),
(N0w,),
(N0suff,),
(N0pref,),
(P1t,),
(P2t,),
(P1t, P2t),
(P1t, N0w),
(P1w,),
(P1suff,),
(P2w,),
(N1w,),
(N1suff,),
(N2w,),
(N0shape,),
(N0c,),
(N1c,),
(N2c,),
(P1c,),
(P2c,),
(P1c, N0c),
(N0c, N1c),
(P1c, P1t),
(P1c, P1t, N0c),
(P1t, N0c),
(N0oft_upper,),
(N0oft_title,),
(P1w, N0w),
(N0w, N1w),
(N0pos,),
(P1t, N0pos, N1pos),
(P1t, N1pos),
(N0url,),
(N0num,),
(P1url,),
(P1url,),
(N1num,),
(N1url,),
)
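
The deleted pos.pyx also records the learning loop's shape: on an error, tell_answer counts the extracted features once with weight +1 for the gold class and once with -1 for the guess, and hands both to the model. A schematic perceptron-style update under an assumed dict-of-dicts weight store (the diff delegates this to thinc's LinearModel):

    def tell_answer(weights, feats, guess, gold):
        if gold == guess:
            return                               # no update on a correct guess
        for f in feats:
            weights.setdefault(gold, {}).setdefault(f, 0.0)
            weights.setdefault(guess, {}).setdefault(f, 0.0)
            weights[gold][f] += 1.0              # promote the gold class
            weights[guess][f] -= 1.0             # demote the wrong guess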

View File

@@ -1,83 +0,0 @@
from .tokens cimport Tokens
from thinc.typedefs cimport atom_t
cpdef enum:
P2i
P2c
P2w
P2shape
P2pref
P2suff
P2title
P2upper
P2oft_title
P2oft_upper
P2pos
P2url
P2num
P1i
P1c
P1w
P1shape
P1pre
P1suff
P1title
P1upper
P1oft_title
P1oft_upper
P1pos
P1url
P1num
N0i
N0c
N0w
N0shape
N0pref
N0suff
N0title
N0upper
N0oft_title
N0oft_upper
N0pos
N0url
N0num
N1i
N1c
N1w
N1shape
N1pref
N1suff
N1title
N1upper
N1oft_title
N1oft_upper
N1pos
N1url
N1num
N2i
N2c
N2w
N2shape
N2pref
N2suff
N2title
N2upper
N2oft_title
N2oft_upper
N2pos
N2url
N2num
P2t
P1t
CONTEXT_SIZE
cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1

View File

@@ -1,77 +1,41 @@
from .lexeme cimport *
from spacy.context cimport FIELD_IDS, Token
from thinc.typedefs cimport atom_t
cpdef Token P2 = FIELD_IDS.P2
cpdef Token P1 = FIELD_IDS.P1
cpdef Token N0 = FIELD_IDS.N0
cpdef Token N1 = FIELD_IDS.N1
cpdef Token N2 = FIELD_IDS.N2
TEMPLATES = (
(N0i,),
(N0w,),
(N0suff,),
(N0pref,),
(P1t,),
(P2t,),
(P1t, P2t),
(P1t, N0w),
(P1w,),
(P1suff,),
(P2w,),
(N1w,),
(N1suff,),
(N2w,),
(N0.i,),
(N0.w,),
(N0.suff,),
(N0.pref,),
(P1.pos,),
(P2.pos,),
(P1.pos, P2.pos),
(P1.pos, N0.w),
(P1.w,),
(P1.suff,),
(P2.w,),
(N1.w,),
(N1.suff,),
(N2.w,),
(N0shape,),
(N0c,),
(N1c,),
(N2c,),
(P1c,),
(P2c,),
(P1c, N0c),
(N0c, N1c),
(P1c, P1t),
(P1c, P1t, N0c),
(P1t, N0c),
(N0oft_upper,),
(N0oft_title,),
(N0.shape,),
(N0.c,),
(N1.c,),
(N2.c,),
(P1.c,),
(P2.c,),
(N0.oft_upper,),
(N0.oft_title,),
(P1w, N0w),
(N0w, N1w),
(N0.postype,),
(N0pos,),
(P1t, N0pos, N1pos),
(P1t, N1pos),
(N0url,),
(N0num,),
(P1url,),
(P1url,),
(N1num,),
(N1url,),
(P1.url,),
(N1.num,),
(N1.url,),
)
cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1:
_fill_token(&context[P2i], tokens.lex[i-2])
_fill_token(&context[P1i], tokens.lex[i-1])
_fill_token(&context[N0i], tokens.lex[i])
_fill_token(&context[N1i], tokens.lex[i+1])
_fill_token(&context[N2i], tokens.lex[i+2])
context[P1t] = tokens.pos[i-1]
context[P2t] = tokens.pos[i-2]
cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
atoms[0] = lex.sic
atoms[1] = lex.cluster
atoms[2] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
atoms[3] = lex.shape
atoms[4] = lex.prefix
atoms[5] = lex.suffix
atoms[6] = lex.flags & (1 << IS_TITLE)
atoms[7] = lex.flags & (1 << IS_UPPER)
atoms[8] = lex.flags & (1 << OFT_TITLE)
atoms[9] = lex.flags & (1 << OFT_UPPER)
atoms[10] = lex.postype
atoms[11] = lex.flags & (1 << LIKE_URL)
atoms[12] = lex.flags & (1 << LIKE_NUMBER)
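
_fill_token reads the boolean lexeme properties out of a packed flags integer with single-bit masks, as in lex.flags & (1 << IS_TITLE). The same pattern in plain Python, with made-up bit positions for illustration:

    IS_TITLE, IS_UPPER, OFT_TITLE, OFT_UPPER, LIKE_URL, LIKE_NUMBER = range(6)

    def set_flag(flags, bit):
        return flags | (1 << bit)

    flags = set_flag(set_flag(0, IS_TITLE), LIKE_URL)
    assert flags & (1 << IS_TITLE)        # non-zero: the flag is set
    assert not flags & (1 << IS_UPPER)    # zero: the flag is clear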

View File

@@ -3,10 +3,8 @@ from . import util
from . import tokens
from .en import EN
from .pos import Tagger
def read_gold(file_, tag_list):
def read_gold(file_, tag_list, col):
paras = file_.read().strip().split('\n\n')
golds = []
tag_ids = dict((tag, i) for i, tag in enumerate(tag_list))
@@ -21,7 +19,7 @@ def read_gold(file_, tag_list):
conll_toks = []
for line in lines:
pieces = line.split()
conll_toks.append((int(pieces[0]), len(pieces[1]), pieces[3]))
conll_toks.append((int(pieces[0]), len(pieces[1]), pieces[col]))
for i, token in enumerate(tokens):
if not conll_toks:
tags.append('NULL')

View File

@@ -4,6 +4,8 @@ from thinc.learner cimport LinearModel
from thinc.features cimport Extractor
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from .typedefs cimport hash_t
from .context cimport Slots
from .tokens cimport Tokens
@@ -26,7 +28,8 @@ cdef class Tagger:
cpdef readonly list tag_names
cdef class_t _guess
cdef atom_t* _context
cdef atom_t* _context_flat
cdef Slots _context_slots
cdef feat_t* _feats
cdef weight_t* _values
cdef weight_t* _scores

View File

@@ -10,8 +10,9 @@ import random
import json
import cython
from .pos_feats cimport fill_context as pos_fill_context
from .pos_feats cimport CONTEXT_SIZE as POS_CONTEXT_SIZE
from .context cimport fill_slots
from .context cimport fill_flat
from .context cimport N_FIELDS
from thinc.features cimport ConjFeat
@@ -46,6 +47,7 @@ def train(train_sents, model_dir, nr_iter=5):
if gold != NULL_TAG:
total += 1
n_corr += guess == gold
#print('%s\t%d\t%d' % (tokens[i].string, guess, gold))
print('%.4f' % ((n_corr / total) * 100))
random.shuffle(train_sents)
tagger.model.end_training()
@@ -76,15 +78,12 @@ cdef class Tagger:
self.tag_names = cfg['tag_names']
self.tag_type = cfg['tag_type']
self.extractor = Extractor(templates, [ConjFeat] * len(templates))
self.model = LinearModel(len(self.tag_names), self.extractor.n)
print("Load tagger model")
self.model = LinearModel(len(self.tag_names))
if path.exists(path.join(model_dir, 'model')):
self.model.load(path.join(model_dir, 'model'))
print("Done")
if self.tag_type == POS:
n_context = POS_CONTEXT_SIZE
self._context = <atom_t*>self.mem.alloc(n_context, sizeof(atom_t))
self._context_flat = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))
self._context_slots = Slots()
self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
@@ -110,11 +109,9 @@ cdef class Tagger:
>>> tag = EN.pos_tagger.predict(0, tokens)
>>> assert tag == EN.pos_tagger.tag_id('DT') == 5
"""
if self.tag_type == POS:
pos_fill_context(self._context, i, tokens)
else:
raise StandardError
self.extractor.extract(self._feats, self._values, self._context, NULL)
cdef hash_t hashed = fill_slots(self._context_slots, i, tokens)
fill_flat(self._context_flat, self._context_slots)
self.extractor.extract(self._feats, self._values, self._context_flat, NULL)
self._guess = self.model.score(self._scores, self._feats, self._values)
return self._guess
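
With the refactor, predict no longer branches on tag type: it fills the Slots window, flattens it into the atom array, extracts template features, and scores. Roughly, continuing the earlier sketches (tokens here is assumed to be a padded list of per-token dicts, and model_score stands in for LinearModel.score):

    def fill_slots(s, i, tokens):
        # tokens is assumed padded so that i-2 .. i+2 are always valid
        # (see the padding in the tokens.pyx diff below).
        for slot, j in ((s.P2, i - 2), (s.P1, i - 1), (s.N0, i),
                        (s.N1, i + 1), (s.N2, i + 2)):
            for name in FIELDS:
                setattr(slot, name, tokens[j].get(name, 0))

    def predict(i, tokens, slots, context, templates, model_score):
        fill_slots(slots, i, tokens)          # window around token i -> Slots
        fill_flat(context, slots)             # Slots -> flat atom array
        feats = extract(context, templates)   # templates -> hashed features
        return model_score(feats)             # stand-in for LinearModel.score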

View File

@@ -15,9 +15,11 @@ cdef class Tokens:
cdef Lexeme** _lex_ptr
cdef int* _idx_ptr
cdef int* _pos_ptr
cdef int* _ner_ptr
cdef Lexeme** lex
cdef int* idx
cdef int* pos
cdef int* ner
cdef int length
cdef int max_length
@@ -32,6 +34,7 @@ cdef class Token:
cdef public int i
cdef public int idx
cdef public int pos
cdef public int ner
cdef public atom_t id
cdef public atom_t cluster

View File

@@ -1,6 +1,7 @@
# cython: profile=True
from .lexeme cimport *
cimport cython
from .tagger cimport POS, ENTITY
DEF PADDING = 5
@@ -44,21 +45,25 @@ cdef class Tokens:
self._lex_ptr = <Lexeme**>self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*))
self._idx_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
self._pos_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
self._ner_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
self.lex = self._lex_ptr
self.idx = self._idx_ptr
self.pos = self._pos_ptr
self.ner = self._ner_ptr
cdef int i
for i in range(size + (PADDING*2)):
self.lex[i] = &EMPTY_LEXEME
self.lex += PADDING
self.idx += PADDING
self.pos += PADDING
self.ner += PADDING
self.max_length = size
self.length = 0
def __getitem__(self, i):
bounds_check(i, self.length, PADDING)
return Token(self._string_store, i, self.idx[i], self.pos[i], self.lex[i][0])
return Token(self._string_store, i, self.idx[i], self.pos[i], self.ner[i],
self.lex[i][0])
def __iter__(self):
for i in range(self.length):
@@ -73,6 +78,7 @@ cdef class Tokens:
self.lex[self.length] = lexeme
self.idx[self.length] = idx
self.pos[self.length] = 0
self.ner[self.length] = 0
self.length += 1
return idx + lexeme.length
@@ -91,7 +97,10 @@ cdef class Tokens:
return idx
cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1:
self.pos[i] = tag
if tag_type == POS:
self.pos[i] = tag
elif tag_type == ENTITY:
self.ner[i] = tag
def _realloc(self, new_size):
self.max_length = new_size
@@ -99,19 +108,23 @@ cdef class Tokens:
self._lex_ptr = <Lexeme**>self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*))
self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
self._ner_ptr = <int*>self.mem.realloc(self._ner_ptr, n * sizeof(int))
self.lex = self._lex_ptr + PADDING
self.idx = self._idx_ptr + PADDING
self.pos = self._pos_ptr + PADDING
self.ner = self._ner_ptr + PADDING
for i in range(self.length, self.max_length + PADDING):
self.lex[i] = &EMPTY_LEXEME
@cython.freelist(64)
cdef class Token:
def __init__(self, StringStore string_store, int i, int idx, int pos, dict lex):
def __init__(self, StringStore string_store, int i, int idx, int pos, int ner,
dict lex):
self._string_store = string_store
self.idx = idx
self.pos = pos
self.ner = ner
self.i = i
self.id = lex['id']
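
Throughout tokens.pyx, the parallel arrays (lex, idx, pos, and now ner) are allocated with PADDING extra slots on each side, and the public pointers are offset by PADDING, so windows like lex[i-2]..lex[i+2] stay in bounds at sentence edges. The same trick in plain Python:

    PADDING = 5
    EMPTY = 0                  # stands in for &EMPTY_LEXEME

    def make_padded(values):
        arr = [EMPTY] * (len(values) + 2 * PADDING)
        arr[PADDING:PADDING + len(values)] = values
        return arr, PADDING    # real data starts at this offset

    arr, off = make_padded([10, 11, 12])
    i = 0                                       # first real token
    window = arr[off + i - 2 : off + i + 3]     # i-2 .. i+2, safely in bounds
    assert window == [EMPTY, EMPTY, 10, 11, 12]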