* Work on parser. Up to 92 UAS on YM labels

2026-03-01 10:21:28 +03:00 · 2014-12-18 09:05:31 +11:00 · 2014-12-18 09:05:31 +11:00 · 8446ebfbbb
commit 8446ebfbbb
parent 55de747bfc
8 changed files with 220 additions and 25 deletions
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@ -11,6 +11,7 @@ from .lexeme cimport Lexeme
 from .tagger cimport Tagger
 from .utf8string cimport StringStore, UniStr
 from .morphology cimport Morphologizer
+from .syntax.parser cimport GreedyParser


 cdef union LexemesOrTokens:
@ -43,6 +44,7 @@ cdef class Language:
    cpdef readonly Lexicon lexicon
    cpdef readonly Tagger pos_tagger
    cpdef readonly Morphologizer morphologizer
+    cpdef readonly GreedyParser parser

    cdef PreshMap _pos_cache
    cdef object _prefix_re
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@ -44,15 +44,19 @@ cdef class Language:
        self.pos_tagger = None
        self.morphologizer = None

-    def load(self, pos_dir=None):
+    def load(self, pos_dir=None, parser_dir=None):
        self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
        self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
        if pos_dir is None:
            pos_dir = path.join(util.DATA_DIR, self.name, 'pos')
+        if parser_dir is None:
+            parser_dir = path.join(util.DATA_DIR, self.name, 'deps')
        if path.exists(pos_dir):
            self.pos_tagger = Tagger(pos_dir)
            self.morphologizer = Morphologizer(self.lexicon.strings, pos_dir)
            #self.load_pos_cache(path.join(util.DATA_DIR, self.name, 'pos', 'bigram_cache_2m'))
+        if path.exists(parser_dir):
+            self.parser = GreedyParser(parser_dir)

    cpdef Tokens tokens_from_list(self, list strings):
        cdef int length = sum([len(s) for s in strings])
--- a/spacy/syntax/_parse_features.pxd
+++ b/spacy/syntax/_parse_features.pxd
@ -22,6 +22,7 @@ cdef int fill_context(atom_t* context, State* state) except -1
 # NB: The order of the enum is _NOT_ arbitrary!!
 cpdef enum:
    S2w
+    S2W
    S2p
    S2c
    S2c4
@ -29,6 +30,7 @@ cpdef enum:
    S2L

    S1w
+    S1W
    S1p
    S1c
    S1c4
@ -36,6 +38,7 @@ cpdef enum:
    S1L

    S1rw
+    S1rW
    S1rp
    S1rc
    S1rc4
@ -43,6 +46,7 @@ cpdef enum:
    S1rL

    S0lw
+    S0lW
    S0lp
    S0lc
    S0lc4
@ -50,6 +54,7 @@ cpdef enum:
    S0lL

    S0l2w
+    S0l2W
    S0l2p
    S0l2c
    S0l2c4
@ -57,6 +62,7 @@ cpdef enum:
    S0l2L

    S0w
+    S0W
    S0p
    S0c
    S0c4
@ -64,6 +70,7 @@ cpdef enum:
    S0L
    
    S0r2w
+    S0r2W
    S0r2p
    S0r2c
    S0r2c4
@ -71,6 +78,7 @@ cpdef enum:
    S0r2L

    S0rw
+    S0rW
    S0rp
    S0rc
    S0rc4
@ -78,6 +86,7 @@ cpdef enum:
    S0rL

    N0l2w
+    N0l2W
    N0l2p
    N0l2c
    N0l2c4
@ -85,6 +94,7 @@ cpdef enum:
    N0l2L

    N0lw
+    N0lW
    N0lp
    N0lc
    N0lc4
@ -92,6 +102,7 @@ cpdef enum:
    N0lL

    N0w
+    N0W
    N0p
    N0c
    N0c4
@ -99,6 +110,7 @@ cpdef enum:
    N0L
 
    N1w
+    N1W
    N1p
    N1c
    N1c4
@ -106,6 +118,7 @@ cpdef enum:
    N1L
    
    N2w
+    N2W
    N2p
    N2c
    N2c4
@ -119,5 +132,9 @@ cpdef enum:
    S0rv
    S1lv
    S1rv
-    
+
+    S0_has_head
+    S1_has_head
+    S2_has_head
+
    CONTEXT_SIZE
--- a/spacy/syntax/_parse_features.pyx
+++ b/spacy/syntax/_parse_features.pyx
@ -13,7 +13,8 @@ from itertools import combinations
 from ..tokens cimport TokenC
 from ._state cimport State
 from ._state cimport get_s2, get_s1, get_s0, get_n0, get_n1, get_n2
-from ._state cimport get_left, get_right
+from ._state cimport has_head, get_left, get_right
+from ._state cimport count_left_kids, count_right_kids


 cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
@ -24,10 +25,12 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
        context[3] = 0
        context[4] = 0
        context[5] = 0
+        context[6] = 0
    else:
        context[0] = token.lex.sic
-        context[1] = token.pos
-        context[2] = token.lex.cluster
+        context[1] = token.lemma
+        context[2] = token.pos
+        context[3] = token.lex.cluster
        # We've read in the string little-endian, so now we can take & (2**n)-1
        # to get the first n bits of the cluster.
        # e.g. s = "1110010101"
@ -40,9 +43,9 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
        # What we're doing here is picking a number where all bits are 1, e.g.
        # 15 is 1111, 63 is 111111 and doing bitwise AND, so getting all bits in
        # the source that are set to 1.
-        context[3] = token.lex.cluster & 63
-        context[4] = token.lex.cluster & 15
-        context[5] = token.dep_tag
+        context[4] = token.lex.cluster & 63
+        context[5] = token.lex.cluster & 15
+        context[6] = token.dep_tag


 cdef int fill_context(atom_t* context, State* state) except -1:
@ -66,12 +69,160 @@ cdef int fill_context(atom_t* context, State* state) except -1:
        context[dist] = state.stack[0] - state.i
    else:
        context[dist] = 0
-    context[N0lv] = 0 
-    context[S0lv] = 0
-    context[S0rv] = 0
-    context[S1lv] = 0
-    context[S1rv] = 0
+    context[N0lv] = max(count_left_kids(get_n0(state)), 5)
+    context[S0lv] = max(count_left_kids(get_s0(state)), 5)
+    context[S0rv] = max(count_right_kids(get_s0(state)), 5)
+    context[S1lv] = max(count_left_kids(get_s1(state)), 5)
+    context[S1rv] = max(count_right_kids(get_s1(state)), 5)

+    context[S0_has_head] = 0
+    context[S1_has_head] = 0
+    context[S2_has_head] = 0
+    if state.stack_len >= 1:
+        context[S0_has_head] = has_head(get_s0(state)) + 1
+        if state.stack_len >= 2:
+            context[S1_has_head] = has_head(get_s1(state)) + 1
+            if state.stack_len >= 3:
+                context[S2_has_head] = has_head(get_s2(state))
+
+
+unigrams = (
+    (S2W, S2p),
+    (S2p,),
+    (S2c,),
+    (S2L,),
+    
+    (S1W, S1p),
+    (S1p,),
+    (S1c,),
+    (S1L,),
+
+    (S0W, S0p),
+    (S0p,),
+    (S0c,),
+    (S0L,),
+ 
+    (N0W, N0p),
+    (N0p,),
+    (N0c,),
+    (N0L,),
+ 
+    (N1W, N1p),
+    (N1p,),
+    (N1c,),
+ 
+    (N2W, N2p),
+    (N2p,),
+    (N2c,),
+
+    (S0r2W, S0r2p),
+    (S0r2p,),
+    (S0r2c,),
+    (S0r2L,),
+
+    (S0rW, S0rp),
+    (S0rp,),
+    (S0rc,),
+    (S0rL,),
+
+    (S0l2W, S0l2p),
+    (S0l2p,),
+    (S0l2c,),
+    (S0l2L,),
+
+    (S0lW, S0lp),
+    (S0lp,),
+    (S0lc,),
+    (S0lL,),
+
+    (N0l2W, N0l2p),
+    (N0l2p,),
+    (N0l2c,),
+    (N0l2L,),
+
+    (N0lW, N0lp),
+    (N0lp,),
+    (N0lc,),
+    (N0lL,),
+)
+
+
+s0_n0 = (
+    (S0W, S0p, N0W, N0p),
+    (S0c, S0p, N0c, N0p),
+    (S0p, N0p),
+    (S0W, N0p),
+    (S0p, N0W),
+    (S0W, N0c),
+    (S0c, N0W),
+    (S0p, N0c),
+    (S0c, N0p),
+    (S0W, S0rp, N0p),
+    (S0p, S0rp, N0p),
+    (S0p, N0lp, N0W),
+    (S0p, N0lp, N0p),
+)
+
+s1_n0 = (
+    (S0_has_head, S1p, N0p),
+    (S0_has_head, S1c, N0c),
+    (S0_has_head, S1c, N0p),
+    (S0_has_head, S1p, N0c),
+    (S0_has_head, S1W, S1p, N0p),
+    (S0_has_head, S1p, N0W, N0p)
+)
+
+
+s0_n1 = (
+    (S0p, N1p),
+    (S0c, N1c),
+    (S0c, N1p),
+    (S0p, N1c),
+    (S0W, S0p, N1p),
+    (S0p, N1W, N1p)
+)
+
+n0_n1 = (
+    (N0W, N0p, N1W, N1p),
+    (N0W, N0p, N1p),
+    (N0p, N1W, N1p),
+    (N0c, N0p, N1c, N1p),
+    (N0c, N1c),
+    (N0p, N1c),
+)
+
+tree_shape = (
+    (dist,),
+    (S0p, S0_has_head, S1_has_head, S2_has_head),
+    (S0p, S0lv, S0rv),
+    (N0p, N0lv),
+
+    #(S0p, S0_left_shape),
+    #(S0p, S0_right_shape),
+    #(N0p, N0_left_shape),
+    #(S0p, S0_left_shape, N0_left_shape)
+)
+
+trigrams = (
+    (N0p, N1p, N2p),
+    (S0p, S0lp, S0l2p),
+    (S0p, S0rp, S0r2p),
+    (S0p, S1p, S2p),
+    (S1p, S0p, N0p),
+    (S0p, S0lp, N0p),
+    (S0p, N0p, N0lp),
+    (N0p, N0lp, N0l2p),
+    
+    (S0W, S0p, S0rL, S0r2L),
+    (S0p, S0rL, S0r2L),
+
+    (S0W, S0p, S0lL, S0l2L),
+    (S0p, S0lL, S0l2L),
+
+    (N0W, N0p, N0lL, N0l2L),
+    (N0p, N0lL, N0l2L),
+)
+ 

 arc_eager = (
    (S0w, S0p),
@ -86,7 +237,6 @@ arc_eager = (
    (N2w, N2p),
    (N2w,),
    (N2p,),
-
    (S0w, S0p, N0w, N0p),
    (S0w, S0p, N0w),
    (S0w, N0w, N0p),
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@ -20,8 +20,7 @@ cdef int pop_stack(State *s) except -1
 cdef int push_stack(State *s) except -1


-cdef inline bint has_head(const TokenC* t) nogil:
-    return t.head != 0
+cdef bint has_head(const TokenC* t) nogil


 cdef inline int get_idx(const State* s, const TokenC* t) nogil:
@ -79,12 +78,10 @@ cdef int head_in_stack(const State *s, const int child, int* gold) except -1
 cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL


-cdef inline int count_left_kids(const TokenC* head) nogil:
-    return _popcount(head.l_kids)
+cdef int count_left_kids(const TokenC* head) nogil


-cdef inline int count_right_kids(const TokenC* head) nogil:
-    return _popcount(head.r_kids)
+cdef int count_right_kids(const TokenC* head) nogil


 # From https://en.wikipedia.org/wiki/Hamming_weight
--- a/spacy/syntax/_state.pyx
+++ b/spacy/syntax/_state.pyx
@ -3,6 +3,7 @@ from libc.string cimport memmove
 from cymem.cymem cimport Pool

 from ..lexeme cimport EMPTY_LEXEME
+from ..tokens cimport TokenC


 cdef int add_dep(State *s, int head, int child, int label) except -1:
@ -88,6 +89,19 @@ cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx)
        return NULL


+cdef bint has_head(const TokenC* t) nogil:
+    return t.head != 0
+
+
+cdef int count_left_kids(const TokenC* head) nogil:
+    return _popcount(head.l_kids)
+
+
+cdef int count_right_kids(const TokenC* head) nogil:
+    return _popcount(head.r_kids)
+
+
+
 DEF PADDING = 5


--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@ -138,6 +138,16 @@ cdef class TransitionSystem:
        unl_costs[RIGHT] = _right_cost(s, gold_heads) if _can_right(s) else -1
        unl_costs[REDUCE] = _reduce_cost(s, gold_heads) if _can_reduce(s) else -1

+        #s0_buff_head = head_in_buffer(s, get_s0(s), gold_heads)
+        #s0_stack_head = head_in_stack(s, get_s0(s), gold_heads)
+        #s0_buff_kids = children_in_buffer(s, get_s0(s), gold_heads)
+        #s0_stack_kids = children_in_stack(s, get_s0(s), gold_heads)
+
+        #n0_buff_head = head_in_buffer(s, get_n0(s), gold_heads)
+        #n0_stack_head = head_in_stack(s, get_n0(s), gold_heads)
+        #n0_buff_kids = children_in_buffer(s, get_n0(s), gold_heads)
+        #n0_stack_kids = children_in_buffer(s, get_n0(s), gold_heads)
+
        cdef int cost
        cdef int move
        cdef int label
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@ -49,7 +49,11 @@ cdef unicode print_state(State* s, list words):

 def get_templates(name):
    pf = _parse_features
-    return pf.arc_eager
+    if name == 'zhang':
+        return pf.arc_eager
+    else:
+        templs = pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + pf.tree_shape + pf.trigrams
+        return templs


 cdef class GreedyParser:
@ -58,7 +62,6 @@ cdef class GreedyParser:
        self.cfg = Config.read(model_dir, 'config')
        self.extractor = Extractor(get_templates(self.cfg.features))
        self.moves = TransitionSystem(self.cfg.left_labels, self.cfg.right_labels)
-        
        self.model = LinearModel(self.moves.n_moves, self.extractor.n_templ)
        if os.path.exists(pjoin(model_dir, 'model')):
            self.model.load(pjoin(model_dir, 'model'))
@ -77,9 +80,7 @@ cdef class GreedyParser:
            fill_context(context, state)
            feats = self.extractor.get_feats(context, &n_feats)
            scores = self.model.get_scores(feats, n_feats)
-
            guess = self.moves.best_valid(scores, state)
-            
            self.moves.transition(state, guess)
        return 0

@ -110,7 +111,7 @@ cdef class GreedyParser:
            self.moves.transition(state, guess)
        cdef int n_corr = 0
        for i in range(tokens.length):
-            n_corr += (i + state.sent[i].head) == heads_array[i]
+            n_corr += (i + state.sent[i].head) == gold_heads[i]
        return n_corr