* Add experimental supersense features for parsing, based on WordNet lookups.

This commit is contained in:
Matthew Honnibal 2015-07-01 20:12:44 +02:00
parent e6d828a9af
commit 52fd80c6c6
2 changed files with 44 additions and 1 deletions

View File

@ -34,6 +34,7 @@ cpdef enum:
S2_shape
S2_ne_iob
S2_ne_type
S2ss
S1w
S1W
@ -47,6 +48,7 @@ cpdef enum:
S1_shape
S1_ne_iob
S1_ne_type
S1ss
S1rw
S1rW
@ -60,6 +62,7 @@ cpdef enum:
S1r_shape
S1r_ne_iob
S1r_ne_type
S1rss
S0lw
S0lW
@ -73,6 +76,7 @@ cpdef enum:
S0l_shape
S0l_ne_iob
S0l_ne_type
S0lss
S0l2w
S0l2W
@ -86,6 +90,7 @@ cpdef enum:
S0l2_shape
S0l2_ne_iob
S0l2_ne_type
S0l2ss
S0w
S0W
@ -99,6 +104,7 @@ cpdef enum:
S0_shape
S0_ne_iob
S0_ne_type
S0ss
S0r2w
S0r2W
@ -112,6 +118,7 @@ cpdef enum:
S0r2_shape
S0r2_ne_iob
S0r2_ne_type
S0r2ss
S0rw
S0rW
@ -125,6 +132,7 @@ cpdef enum:
S0r_shape
S0r_ne_iob
S0r_ne_type
S0rss
N0l2w
N0l2W
@ -138,6 +146,7 @@ cpdef enum:
N0l2_shape
N0l2_ne_iob
N0l2_ne_type
N0l2ss
N0lw
N0lW
@ -151,6 +160,7 @@ cpdef enum:
N0l_shape
N0l_ne_iob
N0l_ne_type
N0lss
N0w
N0W
@ -164,6 +174,7 @@ cpdef enum:
N0_shape
N0_ne_iob
N0_ne_type
N0ss
N1w
N1W
@ -177,6 +188,7 @@ cpdef enum:
N1_shape
N1_ne_iob
N1_ne_type
N1ss
N2w
N2W
@ -190,6 +202,7 @@ cpdef enum:
N2_shape
N2_ne_iob
N2_ne_type
N2ss
P1w
P1W
@ -203,6 +216,7 @@ cpdef enum:
P1_shape
P1_ne_iob
P1_ne_type
P1ss
P2w
P2W
@ -216,6 +230,7 @@ cpdef enum:
P2_shape
P2_ne_iob
P2_ne_type
P2ss
E0w
E0W
@ -229,6 +244,7 @@ cpdef enum:
E0_shape
E0_ne_iob
E0_ne_type
E0ss
E1w
E1W
@ -242,6 +258,7 @@ cpdef enum:
E1_shape
E1_ne_iob
E1_ne_type
E1ss
# Misc features at the end
dist

View File

@ -18,6 +18,8 @@ from .stateclass cimport StateClass
from cymem.cymem cimport Pool
from ..cimport senses
cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
if token is NULL:
@ -33,6 +35,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
context[9] = 0
context[10] = 0
context[11] = 0
context[12] = 0
else:
context[0] = token.lex.orth
context[1] = token.lemma
@ -58,6 +61,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
context[9] = token.lex.shape
context[10] = token.ent_iob
context[11] = token.ent_type
context[12] = token.lex.senses & senses.POS_SENSES[<int>token.pos]
cdef int fill_context(atom_t* ctxt, StateClass st) nogil:
# Take care to fill every element of context!
@ -250,6 +254,22 @@ unigrams = (
(N0lW, N0lp),
(N0lc6, N0lp),
(N0lL,),
(S2ss,),
(S1ss,),
(S1rss,),
(S0lss,),
(S0l2ss,),
(S0ss,),
(S0r2ss,),
(S0rss,),
(N0lss,),
(N0l2ss,),
(N0ss,),
(N1ss,),
(N2ss,),
(P1ss,),
(P2ss,),
)
@ -276,6 +296,7 @@ s0_n0 = (
(S0p, N0lv, N0p),
(S0c6, S0rL, S0r2L, N0p),
(S0p, N0lL, N0l2L, N0p),
(S0ss, N0ss),
)
@ -296,6 +317,7 @@ s1_s0 = (
(S1L, S0L, S0p),
(S1p, S1L, S0L, S0p),
(S1p, S0p),
(S1ss, S0ss),
)
@ -309,7 +331,8 @@ s1_n0 = (
(S1c6, S1p, N0c6, N0p),
(S1L, N0p),
(S1p, S1rL, N0p),
(S1p, S1rp, N0p)
(S1p, S1rp, N0p),
(S1ss, N0ss),
)
@ -323,6 +346,7 @@ s0_n1 = (
(S0c6, S0p, N1c6, N1p),
(S0L, N1p),
(S0p, S0rL, N1p),
(S0ss, N1ss),
)
@ -334,6 +358,7 @@ n0_n1 = (
(N0c6, N0p, N1c6, N1p),
(N0c, N1c),
(N0p, N1c),
(N0ss, N1ss),
)
tree_shape = (
@ -361,6 +386,7 @@ trigrams = (
(N0W, N0p, N0lL, N0l2L),
(N0p, N0lL, N0l2L),
(S1ss, S0ss, N0ss,),
)