spaCy/spacy/syntax/_parse_features.pyx

"""
Fill an array, context, with every _atomic_ value our features reference.
We then write the _actual features_ as tuples of the atoms. The machinery
that translates from the tuples to feature-extractors (which pick the values
out of "context") is in features/extractor.pyx

The atomic feature names are listed in a big enum, so that the feature tuples
can refer to them.
"""
from libc.string cimport memset

from itertools import combinations

from ..tokens cimport TokenC
from ._state cimport State
from ._state cimport get_s2, get_s1, get_s0, get_n0, get_n1, get_n2
from ._state cimport get_p2, get_p1
from ._state cimport get_e0, get_e1
from ._state cimport has_head, get_left, get_right
from ._state cimport count_left_kids, count_right_kids


from .stateclass cimport StateClass

from cymem.cymem cimport Pool


cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
    # Write the 12 atomic values for a single token into context[0..11].
    #
    # `context` must point at the first of 12 consecutive slots (callers pass
    # e.g. &context[S0w]). When `token` is NULL all 12 slots are zeroed, so the
    # slots are always written either way.
    cdef int slot
    if token is NULL:
        for slot in range(12):
            context[slot] = 0
        return

    context[0] = token.lex.orth
    context[1] = token.lemma
    context[2] = token.tag
    context[3] = token.lex.cluster
    # The cluster bit-string was read in little-endian, so masking with
    # (2**n)-1 yields the first n bits of the original string. For example,
    # for s = "1110010101":
    #     int(''.join(reversed(s)), 2) & 15  ->  0b0111
    # which, read back in reverse, is the 4-character prefix "1110".
    # 15 is 0b1111 and 63 is 0b111111, i.e. the 4-bit and 6-bit masks.
    context[4] = token.lex.cluster & 15
    context[5] = token.lex.cluster & 63
    # The dependency label is only reported when has_head() says the token
    # has a head; otherwise the slot is zero.
    if has_head(token):
        context[6] = token.dep
    else:
        context[6] = 0
    context[7] = token.lex.prefix
    context[8] = token.lex.suffix
    context[9] = token.lex.shape
    context[10] = token.ent_iob
    context[11] = token.ent_type

cdef int _new_fill_context(atom_t* ctxt, State* state) except -1:
    # StateClass-based variant of fill_context: wraps `state` in a StateClass
    # and fills every atom of `ctxt` from it.
    #
    # Every element of the context must be written here. A blanket memset
    # would be possible, but it makes it far too easy to ship broken features
    # that barely move accuracy. Leaving a slot unset instead tends to cause a
    # dramatic, obvious regression that gets fixed.
    cdef StateClass stcls = StateClass(state.sent_len)
    stcls.from_struct(state)

    # Stack tokens and their children.
    fill_token(&ctxt[S2w], stcls.S_(2))
    fill_token(&ctxt[S1w], stcls.S_(1))
    fill_token(&ctxt[S1rw], stcls.R_(stcls.S(1), 1))
    fill_token(&ctxt[S0lw], stcls.L_(stcls.S(0), 1))
    fill_token(&ctxt[S0l2w], stcls.L_(stcls.S(0), 2))
    fill_token(&ctxt[S0w], stcls.S_(0))
    fill_token(&ctxt[S0r2w], stcls.R_(stcls.S(0), 2))
    fill_token(&ctxt[S0rw], stcls.R_(stcls.S(0), 1))
    # Buffer tokens, the left children of the first one, and the two tokens
    # immediately before it.
    fill_token(&ctxt[N0lw], stcls.L_(stcls.B(0), 1))
    fill_token(&ctxt[N0l2w], stcls.L_(stcls.B(0), 2))
    fill_token(&ctxt[N0w], stcls.B_(0))
    fill_token(&ctxt[N1w], stcls.B_(1))
    fill_token(&ctxt[N2w], stcls.B_(2))
    fill_token(&ctxt[P1w], stcls.safe_get(stcls.B(0)-1))
    fill_token(&ctxt[P2w], stcls.safe_get(stcls.B(0)-2))

    # TODO: the entity tokens are still read from the State struct rather
    # than through the StateClass wrapper.
    fill_token(&ctxt[E0w], get_e0(state))
    fill_token(&ctxt[E1w], get_e1(state))

    # Distance feature, capped at 5.
    if stcls.stack_depth() < 1 or stcls.eol():
        ctxt[dist] = 0
    else:
        # TODO: This is backwards!! (S0 - B0 rather than B0 - S0.)
        ctxt[dist] = min(stcls.S(0) - stcls.B(0), 5)

    # Valencies (child counts), capped at 5.
    ctxt[N0lv] = min(stcls.n_L(stcls.B(0)), 5)
    ctxt[S0lv] = min(stcls.n_L(stcls.S(0)), 5)
    ctxt[S0rv] = min(stcls.n_R(stcls.S(0)), 5)
    ctxt[S1lv] = min(stcls.n_L(stcls.S(1)), 5)
    ctxt[S1rv] = min(stcls.n_R(stcls.S(1)), 5)

    # has_head flags: 0 when the stack is too shallow for that position,
    # otherwise has_head() + 1.
    if stcls.stack_depth() >= 1:
        ctxt[S0_has_head] = stcls.has_head(stcls.S(0)) + 1
    else:
        ctxt[S0_has_head] = 0
    if stcls.stack_depth() >= 2:
        ctxt[S1_has_head] = stcls.has_head(stcls.S(1)) + 1
    else:
        ctxt[S1_has_head] = 0
    if stcls.stack_depth() >= 3:
        ctxt[S2_has_head] = stcls.has_head(stcls.S(2)) + 1
    else:
        ctxt[S2_has_head] = 0


cdef int fill_context(atom_t* context, State* state) except -1:
    # Fill every atom of `context` from the parser `state` struct.
    #
    # Every element of the context must be written here. A blanket memset
    # would be possible, but it makes it far too easy to ship broken features
    # that barely move accuracy. Leaving a slot unset instead tends to cause a
    # dramatic, obvious regression that gets fixed.

    # Stack tokens and their children.
    fill_token(&context[S2w], get_s2(state))
    fill_token(&context[S1w], get_s1(state))
    fill_token(&context[S1rw], get_right(state, get_s1(state), 1))
    fill_token(&context[S0lw], get_left(state, get_s0(state), 1))
    fill_token(&context[S0l2w], get_left(state, get_s0(state), 2))
    fill_token(&context[S0w], get_s0(state))
    fill_token(&context[S0r2w], get_right(state, get_s0(state), 2))
    fill_token(&context[S0rw], get_right(state, get_s0(state), 1))
    # Buffer tokens, the left children of the first one, and the previous
    # tokens.
    fill_token(&context[N0lw], get_left(state, get_n0(state), 1))
    fill_token(&context[N0l2w], get_left(state, get_n0(state), 2))
    fill_token(&context[N0w], get_n0(state))
    fill_token(&context[N1w], get_n1(state))
    fill_token(&context[N2w], get_n2(state))
    fill_token(&context[P1w], get_p1(state))
    fill_token(&context[P2w], get_p2(state))

    # Entity tokens.
    fill_token(&context[E0w], get_e0(state))
    fill_token(&context[E1w], get_e1(state))

    # Distance feature, capped at 5; zero when the stack is empty.
    if state.stack_len < 1:
        context[dist] = 0
    else:
        context[dist] = min(state.stack[0] - state.i, 5)

    # Valencies (child counts), capped at 5.
    context[N0lv] = min(count_left_kids(get_n0(state)), 5)
    context[S0lv] = min(count_left_kids(get_s0(state)), 5)
    context[S0rv] = min(count_right_kids(get_s0(state)), 5)
    context[S1lv] = min(count_left_kids(get_s1(state)), 5)
    context[S1rv] = min(count_right_kids(get_s1(state)), 5)

    # has_head flags: 0 when the stack is too shallow for that position,
    # otherwise has_head() + 1.
    if state.stack_len >= 1:
        context[S0_has_head] = has_head(get_s0(state)) + 1
    else:
        context[S0_has_head] = 0
    if state.stack_len >= 2:
        context[S1_has_head] = has_head(get_s1(state)) + 1
    else:
        context[S1_has_head] = 0
    if state.stack_len >= 3:
        context[S2_has_head] = has_head(get_s2(state)) + 1
    else:
        context[S2_has_head] = 0


# Feature templates for the named-entity recogniser. Each inner tuple is one
# template: a combination of atom names that index into the context array.
#
# Position prefixes match the slots filled in fill_context: N0/N1/N2 come from
# get_n0/get_n1/get_n2, P1/P2 from get_p1/get_p2, and E0/E1 from get_e0/get_e1.
# The attribute suffixes are defined in an enum that is not part of this file;
# presumably they follow the slot order written by fill_token (w=orth, W=lemma,
# p=tag, c=cluster, ...) -- TODO confirm against the enum declaration.
#
# NOTE(review): several templates appear more than once below, e.g.
# (N0_prefix,), (N0_suffix,), the single *_shape templates, (P1_ne_iob,),
# (P1_ne_iob, P1_ne_type), (N0w, P1_ne_iob, P1_ne_type) and (E0p, E1W).
# Whether the repetition is intentional cannot be determined from this file;
# removing entries would change the template set, so they are left as they are.
ner = (
    # Words in a window around N0.
    (N0W,),
    (P1W,),
    (N1W,),
    (P2W,),
    (N2W,),

    (P1W, N0W,),
    (N0W, N1W),

    # Prefix, suffix and shape atoms.
    (N0_prefix,),
    (N0_suffix,),

    (P1_shape,),
    (N0_shape,),
    (N1_shape,),
    (P1_shape, N0_shape,),
    (N0_shape, P1_shape,),
    (P1_shape, N0_shape, N1_shape),
    (N2_shape,),
    (P2_shape,),

    #(P2_norm, P1_norm, W_norm),
    #(P1_norm, W_norm, N1_norm),
    #(W_norm, N1_norm, N2_norm)

    # The "p" atoms over the same window.
    (P2p,),
    (P1p,),
    (N0p,),
    (N1p,),
    (N2p,),

    (P1p, N0p),
    (N0p, N1p),
    (P2p, P1p, N0p),
    (P1p, N0p, N1p),
    (N0p, N1p, N2p),

    # The "c" atoms over the same window.
    (P2c,),
    (P1c,),
    (N0c,),
    (N1c,),
    (N2c,),

    (P1c, N0c),
    (N0c, N1c),

    # Atoms of the E0 / E1 entity tokens, alone and paired with N0 / P1.
    (E0W,),
    (E0c,),
    (E0p,),

    (E0W, N0W),
    (E0c, N0W),
    (E0p, N0W),

    (E0p, P1p, N0p),
    (E0c, P1c, N0c),

    (E0w, P1c),
    (E0p, P1p),
    (E0c, P1c),
    (E0p, E1p),
    (E0c, P1p),

    (E1W,),
    (E1c,),
    (E1p,),

    (E0W, E1W),
    (E0W, E1p,),
    (E0p, E1W,),
    (E0p, E1W),

    # ne_iob / ne_type atoms of the previous tokens.
    (P1_ne_iob,),
    (P1_ne_iob, P1_ne_type),
    (N0w, P1_ne_iob, P1_ne_type),

    (N0_shape,),
    (N1_shape,),
    (N2_shape,),
    (P1_shape,),
    (P2_shape,),

    (N0_prefix,),
    (N0_suffix,),

    (P1_ne_iob,),
    (P2_ne_iob,),
    (P1_ne_iob, P2_ne_iob),
    (P1_ne_iob, P1_ne_type),
    (P2_ne_iob, P2_ne_type),
    (N0w, P1_ne_iob, P1_ne_type),

    (N0w, N1w),
)


# Feature templates that each draw their atoms from a single context token:
# the stack tokens S2/S1/S0, the buffer tokens N0/N1/N2, and the left/right
# children of S0 and N0 (the S0r*, S0l* and N0l* slots filled in fill_context).
# The attribute suffixes (W, p, c, c6, L) are declared in an enum that is not
# part of this file; presumably they follow the slot order written by
# fill_token -- TODO confirm.
unigrams = (
    (S2W, S2p),
    (S2c6, S2p),

    (S1W, S1p),
    (S1c6, S1p),

    (S0W, S0p),
    (S0c6, S0p),

    (N0W, N0p),
    (N0p,),
    (N0c,),
    (N0c6, N0p),
    (N0L,),

    (N1W, N1p),
    (N1c6, N1p),

    (N2W, N2p),
    (N2c6, N2p),

    (S0r2W, S0r2p),
    (S0r2c6, S0r2p),
    (S0r2L,),

    (S0rW, S0rp),
    (S0rc6, S0rp),
    (S0rL,),

    (S0l2W, S0l2p),
    (S0l2c6, S0l2p),
    (S0l2L,),

    (S0lW, S0lp),
    (S0lc6, S0lp),
    (S0lL,),

    (N0l2W, N0l2p),
    (N0l2c6, N0l2p),
    (N0l2L,),

    (N0lW, N0lp),
    (N0lc6, N0lp),
    (N0lL,),
)


# Feature templates that pair atoms of the S0 slot (filled from get_s0 in
# fill_context) with atoms of the N0 slot (filled from get_n0). The last four
# templates also bring in the "p" atom of S0's right child (S0rp) or of N0's
# left child (N0lp).
s0_n0 = (
    (S0W, S0p, N0W, N0p),
    (S0c, S0p, N0c, N0p),
    (S0c6, S0p, N0c6, N0p),
    (S0c4, S0p, N0c4, N0p),
    (S0p, N0p),
    (S0W, N0p),
    (S0p, N0W),
    (S0W, N0c),
    (S0c, N0W),
    (S0p, N0c),
    (S0c, N0p),
    (S0W, S0rp, N0p),
    (S0p, S0rp, N0p),
    (S0p, N0lp, N0W),
    (S0p, N0lp, N0p),
)


# Feature templates that pair atoms of the S1 slot (filled from get_s1 in
# fill_context) with atoms of the N0 slot (filled from get_n0).
s1_n0 = (
    (S1p, N0p),
    (S1c, N0c),
    (S1c, N0p),
    (S1p, N0c),
    (S1W, S1p, N0p),
    (S1p, N0W, N0p),
    (S1c6, S1p, N0c6, N0p),
)


# Feature templates that pair atoms of the S0 slot (filled from get_s0 in
# fill_context) with atoms of the N1 slot (filled from get_n1). The shape of
# the templates mirrors s1_n0.
s0_n1 = (
    (S0p, N1p),
    (S0c, N1c),
    (S0c, N1p),
    (S0p, N1c),
    (S0W, S0p, N1p),
    (S0p, N1W, N1p),
    (S0c6, S0p, N1c6, N1p),
)


# Feature templates that pair atoms of the N0 slot with atoms of the N1 slot
# (filled from get_n0 and get_n1 respectively in fill_context).
n0_n1 = (
    (N0W, N0p, N1W, N1p),
    (N0W, N0p, N1p),
    (N0p, N1W, N1p),
    (N0c, N0p, N1c, N1p),
    (N0c6, N0p, N1c6, N1p),
    (N0c, N1c),
    (N0p, N1c),
)

# Feature templates built from the non-token atoms computed at the end of
# fill_context: `dist` (capped at 5), the S*_has_head flags (0 when the stack
# is too shallow, otherwise has_head() + 1) and the *lv / *rv child counts
# (count_left_kids / count_right_kids, capped at 5).
tree_shape = (
    (dist,),
    (S0p, S0_has_head, S1_has_head, S2_has_head),
    (S0p, S0lv, S0rv),
    (N0p, N0lv),
)

# Feature templates combining three or four atoms. The first group uses only
# "p" atoms across buffer, stack and child slots. The remaining groups combine
# S0 or N0 with the "L" atoms of their first and second right/left children.
trigrams = (
    (N0p, N1p, N2p),
    (S0p, S0lp, S0l2p),
    (S0p, S0rp, S0r2p),
    (S0p, S1p, S2p),
    (S1p, S0p, N0p),
    (S0p, S0lp, N0p),
    (S0p, N0p, N0lp),
    (N0p, N0lp, N0l2p),

    (S0W, S0p, S0rL, S0r2L),
    (S0p, S0rL, S0r2L),

    (S0W, S0p, S0lL, S0l2L),
    (S0p, S0lL, S0l2L),

    (N0W, N0p, N0lL, N0l2L),
    (N0p, N0lL, N0l2L),
)
* Work on greedy parser 2014-12-16 14:44:43 +03:00			`"""`
			`Fill an array, context, with every _atomic_ value our features reference.`
			`We then write the _actual features_ as tuples of the atoms. The machinery`
			`that translates from the tuples to feature-extractors (which pick the values`
			`out of "context") is in features/extractor.pyx`

			`The atomic feature names are listed in a big enum, so that the feature tuples`
			`can refer to them.`
			`"""`
* Add comment to fill_context 2015-03-24 06:39:58 +03:00			`from libc.string cimport memset`

* Work on greedy parser 2014-12-16 14:44:43 +03:00			`from itertools import combinations`

			`from ..tokens cimport TokenC`
			`from ._state cimport State`
			`from ._state cimport get_s2, get_s1, get_s0, get_n0, get_n1, get_n2`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`from ._state cimport get_p2, get_p1`
* Improve features for NER 2015-03-11 04:26:13 +03:00			`from ._state cimport get_e0, get_e1`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`from ._state cimport has_head, get_left, get_right`
			`from ._state cimport count_left_kids, count_right_kids`
* Work on greedy parser 2014-12-16 14:44:43 +03:00

* Prepare to switch to using state class, instead of state struct 2015-06-09 22:20:14 +03:00			`from .stateclass cimport StateClass`

			`from cymem.cymem cimport Pool`


* Work on greedy parser 2014-12-16 14:44:43 +03:00			`cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:`
* Work on greedy parser 2014-12-16 19:19:43 +03:00			`if token is NULL:`
			`context[0] = 0`
			`context[1] = 0`
			`context[2] = 0`
			`context[3] = 0`
			`context[4] = 0`
			`context[5] = 0`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`context[6] = 0`
* Fix bug in prefix, suffix and word shape features in parser and NER 2015-04-10 04:53:33 +03:00			`context[7] = 0`
			`context[8] = 0`
			`context[9] = 0`
* Add ne_iob and ne_type attributes to context vector 2015-04-10 06:02:15 +03:00			`context[10] = 0`
			`context[11] = 0`
* Work on greedy parser 2014-12-16 19:19:43 +03:00			`else:`
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`context[0] = token.lex.orth`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`context[1] = token.lemma`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`context[2] = token.tag`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`context[3] = token.lex.cluster`
* Work on greedy parser 2014-12-16 19:19:43 +03:00			`# We've read in the string little-endian, so now we can take & (2**n)-1`
			`# to get the first n bits of the cluster.`
			`# e.g. s = "1110010101"`
			`# s = ''.join(reversed(s))`
			`# first_4_bits = int(s, 2)`
			`# print first_4_bits`
			`# 5`
			`# print "{0:b}".format(prefix).ljust(4, '0')`
			`# 1110`
			`# What we're doing here is picking a number where all bits are 1, e.g.`
			`# 15 is 1111, 63 is 111111 and doing bitwise AND, so getting all bits in`
			`# the source that are set to 1.`
* Fix two bugs in feature calculation 2015-04-29 00:25:09 +03:00			`context[4] = token.lex.cluster & 15`
			`context[5] = token.lex.cluster & 63`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`context[6] = token.dep if has_head(token) else 0`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`context[7] = token.lex.prefix`
			`context[8] = token.lex.suffix`
			`context[9] = token.lex.shape`
* Add ne_iob and ne_type attributes to context vector 2015-04-10 06:02:15 +03:00			`context[10] = token.ent_iob`
			`context[11] = token.ent_type`
* Work on greedy parser 2014-12-16 14:44:43 +03:00
* Prepare to switch to using state class, instead of state struct 2015-06-09 22:20:14 +03:00			`cdef int _new_fill_context(atom_t* ctxt, State* state) except -1:`
			`# Take care to fill every element of context!`
			`# We could memset, but this makes it very easy to have broken features that`
			`# make almost no impact on accuracy. If instead they're unset, the impact`
			`# tends to be dramatic, so we get an obvious regression to fix...`
			`cdef StateClass st = StateClass(state.sent_len)`
			`st.from_struct(state)`
			`fill_token(&ctxt[S2w], st.S_(2))`
			`fill_token(&ctxt[S1w], st.S_(1))`
			`fill_token(&ctxt[S1rw], st.R_(st.S(1), 1))`
			`fill_token(&ctxt[S0lw], st.L_(st.S(0), 1))`
			`fill_token(&ctxt[S0l2w], st.L_(st.S(0), 2))`
			`fill_token(&ctxt[S0w], st.S_(0))`
			`fill_token(&ctxt[S0r2w], st.R_(st.S(0), 2))`
			`fill_token(&ctxt[S0rw], st.R_(st.S(0), 1))`
			`fill_token(&ctxt[N0lw], st.L_(st.B(0), 1))`
			`fill_token(&ctxt[N0l2w], st.L_(st.B(0), 2))`
			`fill_token(&ctxt[N0w], st.B_(0))`
			`fill_token(&ctxt[N1w], st.B_(1))`
			`fill_token(&ctxt[N2w], st.B_(2))`
			`fill_token(&ctxt[P1w], st.safe_get(st.B(0)-1))`
			`fill_token(&ctxt[P2w], st.safe_get(st.B(0)-2))`

			`# TODO`
			`fill_token(&ctxt[E0w], get_e0(state))`
			`fill_token(&ctxt[E1w], get_e1(state))`

			`if st.stack_depth() >= 1 and not st.eol():`
			`ctxt[dist] = min(st.S(0) - st.B(0), 5) # TODO: This is backwards!!`
			`else:`
			`ctxt[dist] = 0`
			`ctxt[N0lv] = min(st.n_L(st.B(0)), 5)`
			`ctxt[S0lv] = min(st.n_L(st.S(0)), 5)`
			`ctxt[S0rv] = min(st.n_R(st.S(0)), 5)`
			`ctxt[S1lv] = min(st.n_L(st.S(1)), 5)`
			`ctxt[S1rv] = min(st.n_R(st.S(1)), 5)`

			`ctxt[S0_has_head] = 0`
			`ctxt[S1_has_head] = 0`
			`ctxt[S2_has_head] = 0`
			`if st.stack_depth() >= 1:`
			`ctxt[S0_has_head] = st.has_head(st.S(0)) + 1`
			`if st.stack_depth() >= 2:`
			`ctxt[S1_has_head] = st.has_head(st.S(1)) + 1`
			`if st.stack_depth() >= 3:`
			`ctxt[S2_has_head] = st.has_head(st.S(2)) + 1`

* Work on greedy parser 2014-12-16 14:44:43 +03:00
			`cdef int fill_context(atom_t* context, State* state) except -1:`
* Add comment to fill_context 2015-03-24 06:39:58 +03:00			`# Take care to fill every element of context!`
			`# We could memset, but this makes it very easy to have broken features that`
			`# make almost no impact on accuracy. If instead they're unset, the impact`
			`# tends to be dramatic, so we get an obvious regression to fix...`
* Work on greedy parser 2014-12-16 14:44:43 +03:00			`fill_token(&context[S2w], get_s2(state))`
			`fill_token(&context[S1w], get_s1(state))`
* Work on greedy parser 2014-12-16 19:19:43 +03:00			`fill_token(&context[S1rw], get_right(state, get_s1(state), 1))`
			`fill_token(&context[S0lw], get_left(state, get_s0(state), 1))`
			`fill_token(&context[S0l2w], get_left(state, get_s0(state), 2))`
* Work on greedy parser 2014-12-16 14:44:43 +03:00			`fill_token(&context[S0w], get_s0(state))`
* Work on greedy parser 2014-12-16 19:19:43 +03:00			`fill_token(&context[S0r2w], get_right(state, get_s0(state), 2))`
			`fill_token(&context[S0rw], get_right(state, get_s0(state), 1))`
* Work on greedy parser 2014-12-17 13:09:29 +03:00			`fill_token(&context[N0lw], get_left(state, get_n0(state), 1))`
			`fill_token(&context[N0l2w], get_left(state, get_n0(state), 2))`
* Work on greedy parser 2014-12-16 14:44:43 +03:00			`fill_token(&context[N0w], get_n0(state))`
* Work on greedy parser 2014-12-16 19:19:43 +03:00			`fill_token(&context[N1w], get_n1(state))`
* Important bug fix: Fill token N2w, which was being unfilled, after a bad edit while writing the NER features. 2015-03-24 06:32:11 +03:00			`fill_token(&context[N2w], get_n2(state))`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`fill_token(&context[P1w], get_p1(state))`
			`fill_token(&context[P2w], get_p2(state))`
* Work on greedy parser 2014-12-16 14:44:43 +03:00
* Improve features for NER 2015-03-11 04:26:13 +03:00			`fill_token(&context[E0w], get_e0(state))`
			`fill_token(&context[E1w], get_e1(state))`
* Work on greedy parser 2014-12-16 19:19:43 +03:00			`if state.stack_len >= 1:`
* Add length cap to distance feature 2015-05-31 06:25:30 +03:00			`context[dist] = min(state.stack[0] - state.i, 5)`
* Work on greedy parser 2014-12-16 19:19:43 +03:00			`else:`
			`context[dist] = 0`
* Fix valency features 2015-05-31 06:24:33 +03:00			`context[N0lv] = min(count_left_kids(get_n0(state)), 5)`
			`context[S0lv] = min(count_left_kids(get_s0(state)), 5)`
			`context[S0rv] = min(count_right_kids(get_s0(state)), 5)`
			`context[S1lv] = min(count_left_kids(get_s1(state)), 5)`
			`context[S1rv] = min(count_right_kids(get_s1(state)), 5)`
* Work on greedy parser 2014-12-16 14:44:43 +03:00
* Upd docstrings 2014-12-27 10:45:16 +03:00			`context[S0_has_head] = 0`
			`context[S1_has_head] = 0`
			`context[S2_has_head] = 0`
			`if state.stack_len >= 1:`
			`context[S0_has_head] = has_head(get_s0(state)) + 1`
			`if state.stack_len >= 2:`
			`context[S1_has_head] = has_head(get_s1(state)) + 1`
			`if state.stack_len >= 3:`
* Fix two bugs in feature calculation 2015-04-29 00:25:09 +03:00			`context[S2_has_head] = has_head(get_s2(state)) + 1`
* Upd docstrings 2014-12-27 10:45:16 +03:00
* Add ne_iob and ne_type features to NER 2015-04-10 20:07:08 +03:00
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`ner = (`
* Move Span class to own file 2015-03-26 05:19:07 +03:00			`(N0W,),`
			`(P1W,),`
			`(N1W,),`
			`(P2W,),`
			`(N2W,),`
Remove trailing whitespace 2015-04-19 11:31:31 +03:00
* Move Span class to own file 2015-03-26 05:19:07 +03:00			`(P1W, N0W,),`
			`(N0W, N1W),`
Remove trailing whitespace 2015-04-19 11:31:31 +03:00
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`(N0_prefix,),`
			`(N0_suffix,),`

			`(P1_shape,),`
			`(N0_shape,),`
			`(N1_shape,),`
			`(P1_shape, N0_shape,),`
			`(N0_shape, P1_shape,),`
			`(P1_shape, N0_shape, N1_shape),`
			`(N2_shape,),`
			`(P2_shape,),`

			`#(P2_norm, P1_norm, W_norm),`
			`#(P1_norm, W_norm, N1_norm),`
			`#(W_norm, N1_norm, N2_norm)`

			`(P2p,),`
			`(P1p,),`
			`(N0p,),`
			`(N1p,),`
			`(N2p,),`

			`(P1p, N0p),`
			`(N0p, N1p),`
			`(P2p, P1p, N0p),`
			`(P1p, N0p, N1p),`
			`(N0p, N1p, N2p),`

			`(P2c,),`
			`(P1c,),`
			`(N0c,),`
			`(N1c,),`
			`(N2c,),`

			`(P1c, N0c),`
			`(N0c, N1c),`
* Improve features for NER 2015-03-11 04:26:13 +03:00
* Move Span class to own file 2015-03-26 05:19:07 +03:00			`(E0W,),`
* Improve features for NER 2015-03-11 04:26:13 +03:00			`(E0c,),`
			`(E0p,),`

* Move Span class to own file 2015-03-26 05:19:07 +03:00			`(E0W, N0W),`
			`(E0c, N0W),`
			`(E0p, N0W),`
* Improve features for NER 2015-03-11 04:26:13 +03:00
			`(E0p, P1p, N0p),`
			`(E0c, P1c, N0c),`

			`(E0w, P1c),`
			`(E0p, P1p),`
			`(E0c, P1c),`
			`(E0p, E1p),`
			`(E0c, P1p),`

* Move Span class to own file 2015-03-26 05:19:07 +03:00			`(E1W,),`
* Improve features for NER 2015-03-11 04:26:13 +03:00			`(E1c,),`
			`(E1p,),`

* Move Span class to own file 2015-03-26 05:19:07 +03:00			`(E0W, E1W),`
			`(E0W, E1p,),`
			`(E0p, E1W,),`
			`(E0p, E1W),`
* Add ne_iob and ne_type features to NER 2015-04-10 20:07:08 +03:00
			`(P1_ne_iob,),`
			`(P1_ne_iob, P1_ne_type),`
			`(N0w, P1_ne_iob, P1_ne_type),`

			`(N0_shape,),`
			`(N1_shape,),`
			`(N2_shape,),`
			`(P1_shape,),`
			`(P2_shape,),`

			`(N0_prefix,),`
			`(N0_suffix,),`

			`(P1_ne_iob,),`
			`(P2_ne_iob,),`
			`(P1_ne_iob, P2_ne_iob),`
			`(P1_ne_iob, P1_ne_type),`
			`(P2_ne_iob, P2_ne_type),`
			`(N0w, P1_ne_iob, P1_ne_type),`
* Add (N0w, N1w) unigram pair to NER features, prompted by failure to detect 'this weekend' 2015-04-15 07:01:18 +03:00
			`(N0w, N1w),`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`)`

* Merge train.py 2015-02-04 22:16:14 +03:00
* Upd docstrings 2014-12-27 10:45:16 +03:00			`unigrams = (`
			`(S2W, S2p),`
			`(S2c6, S2p),`
Remove trailing whitespace 2015-04-19 11:31:31 +03:00
* Upd docstrings 2014-12-27 10:45:16 +03:00			`(S1W, S1p),`
			`(S1c6, S1p),`

			`(S0W, S0p),`
			`(S0c6, S0p),`
Remove trailing whitespace 2015-04-19 11:31:31 +03:00
* Upd docstrings 2014-12-27 10:45:16 +03:00			`(N0W, N0p),`
			`(N0p,),`
			`(N0c,),`
			`(N0c6, N0p),`
			`(N0L,),`
Remove trailing whitespace 2015-04-19 11:31:31 +03:00
* Upd docstrings 2014-12-27 10:45:16 +03:00			`(N1W, N1p),`
			`(N1c6, N1p),`
Remove trailing whitespace 2015-04-19 11:31:31 +03:00
* Upd docstrings 2014-12-27 10:45:16 +03:00			`(N2W, N2p),`
			`(N2c6, N2p),`

			`(S0r2W, S0r2p),`
			`(S0r2c6, S0r2p),`
			`(S0r2L,),`

			`(S0rW, S0rp),`
			`(S0rc6, S0rp),`
			`(S0rL,),`

			`(S0l2W, S0l2p),`
			`(S0l2c6, S0l2p),`
			`(S0l2L,),`

			`(S0lW, S0lp),`
			`(S0lc6, S0lp),`
			`(S0lL,),`

			`(N0l2W, N0l2p),`
			`(N0l2c6, N0l2p),`
			`(N0l2L,),`

			`(N0lW, N0lp),`
			`(N0lc6, N0lp),`
			`(N0lL,),`
			`)`


			`s0_n0 = (`
			`(S0W, S0p, N0W, N0p),`
			`(S0c, S0p, N0c, N0p),`
			`(S0c6, S0p, N0c6, N0p),`
			`(S0c4, S0p, N0c4, N0p),`
			`(S0p, N0p),`
			`(S0W, N0p),`
			`(S0p, N0W),`
			`(S0W, N0c),`
			`(S0c, N0W),`
			`(S0p, N0c),`
			`(S0c, N0p),`
			`(S0W, S0rp, N0p),`
			`(S0p, S0rp, N0p),`
			`(S0p, N0lp, N0W),`
			`(S0p, N0lp, N0p),`
			`)`


			`s1_n0 = (`
			`(S1p, N0p),`
			`(S1c, N0c),`
			`(S1c, N0p),`
			`(S1p, N0c),`
			`(S1W, S1p, N0p),`
			`(S1p, N0W, N0p),`
			`(S1c6, S1p, N0c6, N0p),`
			`)`


			`s0_n1 = (`
			`(S0p, N1p),`
			`(S0c, N1c),`
			`(S0c, N1p),`
			`(S0p, N1c),`
			`(S0W, S0p, N1p),`
			`(S0p, N1W, N1p),`
			`(S0c6, S0p, N1c6, N1p),`
			`)`

* Fix two bugs in feature calculation 2015-04-29 00:25:09 +03:00
* Upd docstrings 2014-12-27 10:45:16 +03:00			`n0_n1 = (`
			`(N0W, N0p, N1W, N1p),`
			`(N0W, N0p, N1p),`
			`(N0p, N1W, N1p),`
			`(N0c, N0p, N1c, N1p),`
			`(N0c6, N0p, N1c6, N1p),`
			`(N0c, N1c),`
			`(N0p, N1c),`
			`)`

			`tree_shape = (`
			`(dist,),`
			`(S0p, S0_has_head, S1_has_head, S2_has_head),`
			`(S0p, S0lv, S0rv),`
			`(N0p, N0lv),`
			`)`

			`trigrams = (`
			`(N0p, N1p, N2p),`
			`(S0p, S0lp, S0l2p),`
			`(S0p, S0rp, S0r2p),`
			`(S0p, S1p, S2p),`
			`(S1p, S0p, N0p),`
			`(S0p, S0lp, N0p),`
			`(S0p, N0p, N0lp),`
			`(N0p, N0lp, N0l2p),`
* Whitespace 2015-04-29 15:22:47 +03:00
* Upd docstrings 2014-12-27 10:45:16 +03:00			`(S0W, S0p, S0rL, S0r2L),`
			`(S0p, S0rL, S0r2L),`

			`(S0W, S0p, S0lL, S0l2L),`
			`(S0p, S0lL, S0l2L),`

			`(N0W, N0p, N0lL, N0l2L),`
			`(N0p, N0lL, N0l2L),`
			`)`