* Add experimental supersense features for parsing, based on lookup into WordNet.

2026-01-09 18:21:14 +03:00 · 2015-07-01 20:12:44 +02:00 · 2015-07-01 20:12:44 +02:00 · 52fd80c6c6
commit 52fd80c6c6
parent e6d828a9af
2 changed files with 44 additions and 1 deletions
--- a/spacy/syntax/_parse_features.pxd
+++ b/spacy/syntax/_parse_features.pxd
@ -34,6 +34,7 @@ cpdef enum:
    S2_shape
    S2_ne_iob
    S2_ne_type
+    S2ss

    S1w
    S1W
@ -47,6 +48,7 @@ cpdef enum:
    S1_shape
    S1_ne_iob
    S1_ne_type
+    S1ss

    S1rw
    S1rW
@ -60,6 +62,7 @@ cpdef enum:
    S1r_shape
    S1r_ne_iob
    S1r_ne_type
+    S1rss

    S0lw
    S0lW
@ -73,6 +76,7 @@ cpdef enum:
    S0l_shape
    S0l_ne_iob
    S0l_ne_type
+    S0lss

    S0l2w
    S0l2W
@ -86,6 +90,7 @@ cpdef enum:
    S0l2_shape
    S0l2_ne_iob
    S0l2_ne_type
+    S0l2ss

    S0w
    S0W
@ -99,6 +104,7 @@ cpdef enum:
    S0_shape
    S0_ne_iob
    S0_ne_type
+    S0ss

    S0r2w
    S0r2W
@ -112,6 +118,7 @@ cpdef enum:
    S0r2_shape
    S0r2_ne_iob
    S0r2_ne_type
+    S0r2ss

    S0rw
    S0rW
@ -125,6 +132,7 @@ cpdef enum:
    S0r_shape
    S0r_ne_iob
    S0r_ne_type
+    S0rss

    N0l2w
    N0l2W
@ -138,6 +146,7 @@ cpdef enum:
    N0l2_shape
    N0l2_ne_iob
    N0l2_ne_type
+    N0l2ss

    N0lw
    N0lW
@ -151,6 +160,7 @@ cpdef enum:
    N0l_shape
    N0l_ne_iob
    N0l_ne_type
+    N0lss

    N0w
    N0W
@ -164,6 +174,7 @@ cpdef enum:
    N0_shape
    N0_ne_iob
    N0_ne_type
+    N0ss

    N1w
    N1W
@ -177,6 +188,7 @@ cpdef enum:
    N1_shape
    N1_ne_iob
    N1_ne_type
+    N1ss

    N2w
    N2W
@ -190,6 +202,7 @@ cpdef enum:
    N2_shape
    N2_ne_iob
    N2_ne_type
+    N2ss

    P1w
    P1W
@ -203,6 +216,7 @@ cpdef enum:
    P1_shape
    P1_ne_iob
    P1_ne_type
+    P1ss

    P2w
    P2W
@ -216,6 +230,7 @@ cpdef enum:
    P2_shape
    P2_ne_iob
    P2_ne_type
+    P2ss

    E0w
    E0W
@ -229,6 +244,7 @@ cpdef enum:
    E0_shape
    E0_ne_iob
    E0_ne_type
+    E0ss

    E1w
    E1W
@ -242,6 +258,7 @@ cpdef enum:
    E1_shape
    E1_ne_iob
    E1_ne_type
+    E1ss

    # Misc features at the end
    dist
--- a/spacy/syntax/_parse_features.pyx
+++ b/spacy/syntax/_parse_features.pyx
@ -18,6 +18,8 @@ from .stateclass cimport StateClass

 from cymem.cymem cimport Pool

+from ..cimport senses
+

 cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
    if token is NULL:
@ -33,6 +35,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
        context[9] = 0
        context[10] = 0
        context[11] = 0
+        context[12] = 0
    else:
        context[0] = token.lex.orth
        context[1] = token.lemma
@ -58,6 +61,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
        context[9] = token.lex.shape
        context[10] = token.ent_iob
        context[11] = token.ent_type
+        context[12] = token.lex.senses & senses.POS_SENSES[<int>token.pos]

 cdef int fill_context(atom_t* ctxt, StateClass st) nogil:
    # Take care to fill every element of context!
@ -250,6 +254,22 @@ unigrams = (
    (N0lW, N0lp),
    (N0lc6, N0lp),
    (N0lL,),
+
+    (S2ss,),
+    (S1ss,),
+    (S1rss,),
+    (S0lss,),
+    (S0l2ss,),
+    (S0ss,),
+    (S0r2ss,),
+    (S0rss,),
+    (N0lss,),
+    (N0l2ss,),
+    (N0ss,),
+    (N1ss,),
+    (N2ss,),
+    (P1ss,),
+    (P2ss,),
 )


@ -276,6 +296,7 @@ s0_n0 = (
    (S0p, N0lv, N0p),
    (S0c6, S0rL, S0r2L, N0p),
    (S0p, N0lL, N0l2L, N0p),
+    (S0ss, N0ss),
 )


@ -296,6 +317,7 @@ s1_s0 = (
    (S1L, S0L, S0p),
    (S1p, S1L, S0L, S0p),
    (S1p, S0p),
+    (S1ss, S0ss),
 )


@ -309,7 +331,8 @@ s1_n0 = (
    (S1c6, S1p, N0c6, N0p),
    (S1L, N0p),
    (S1p, S1rL, N0p),
-    (S1p, S1rp, N0p)
+    (S1p, S1rp, N0p),
+    (S1ss, N0ss),
 )


@ -323,6 +346,7 @@ s0_n1 = (
    (S0c6, S0p, N1c6, N1p),
    (S0L, N1p),
    (S0p, S0rL, N1p),
+    (S0ss, N1ss),
 )


@ -334,6 +358,7 @@ n0_n1 = (
    (N0c6, N0p, N1c6, N1p),
    (N0c, N1c),
    (N0p, N1c),
+    (N0ss, N1ss),
 )

 tree_shape = (
@ -361,6 +386,7 @@ trigrams = (

    (N0W, N0p, N0lL, N0l2L),
    (N0p, N0lL, N0l2L),
+    (S1ss, S0ss, N0ss,), 
 )