* Work on parser. Up to 92 UAS on YM labels

This commit is contained in:
Matthew Honnibal 2014-12-18 09:05:31 +11:00
parent 55de747bfc
commit 8446ebfbbb
8 changed files with 220 additions and 25 deletions

View File

@ -11,6 +11,7 @@ from .lexeme cimport Lexeme
from .tagger cimport Tagger
from .utf8string cimport StringStore, UniStr
from .morphology cimport Morphologizer
from .syntax.parser cimport GreedyParser
cdef union LexemesOrTokens:
@ -43,6 +44,7 @@ cdef class Language:
cpdef readonly Lexicon lexicon
cpdef readonly Tagger pos_tagger
cpdef readonly Morphologizer morphologizer
cpdef readonly GreedyParser parser
cdef PreshMap _pos_cache
cdef object _prefix_re

View File

@ -44,15 +44,19 @@ cdef class Language:
self.pos_tagger = None
self.morphologizer = None
def load(self, pos_dir=None):
def load(self, pos_dir=None, parser_dir=None):
self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
if pos_dir is None:
pos_dir = path.join(util.DATA_DIR, self.name, 'pos')
if parser_dir is None:
parser_dir = path.join(util.DATA_DIR, self.name, 'deps')
if path.exists(pos_dir):
self.pos_tagger = Tagger(pos_dir)
self.morphologizer = Morphologizer(self.lexicon.strings, pos_dir)
#self.load_pos_cache(path.join(util.DATA_DIR, self.name, 'pos', 'bigram_cache_2m'))
if path.exists(parser_dir):
self.parser = GreedyParser(parser_dir)
cpdef Tokens tokens_from_list(self, list strings):
cdef int length = sum([len(s) for s in strings])

View File

@ -22,6 +22,7 @@ cdef int fill_context(atom_t* context, State* state) except -1
# NB: The order of the enum is _NOT_ arbitrary!!
cpdef enum:
S2w
S2W
S2p
S2c
S2c4
@ -29,6 +30,7 @@ cpdef enum:
S2L
S1w
S1W
S1p
S1c
S1c4
@ -36,6 +38,7 @@ cpdef enum:
S1L
S1rw
S1rW
S1rp
S1rc
S1rc4
@ -43,6 +46,7 @@ cpdef enum:
S1rL
S0lw
S0lW
S0lp
S0lc
S0lc4
@ -50,6 +54,7 @@ cpdef enum:
S0lL
S0l2w
S0l2W
S0l2p
S0l2c
S0l2c4
@ -57,6 +62,7 @@ cpdef enum:
S0l2L
S0w
S0W
S0p
S0c
S0c4
@ -64,6 +70,7 @@ cpdef enum:
S0L
S0r2w
S0r2W
S0r2p
S0r2c
S0r2c4
@ -71,6 +78,7 @@ cpdef enum:
S0r2L
S0rw
S0rW
S0rp
S0rc
S0rc4
@ -78,6 +86,7 @@ cpdef enum:
S0rL
N0l2w
N0l2W
N0l2p
N0l2c
N0l2c4
@ -85,6 +94,7 @@ cpdef enum:
N0l2L
N0lw
N0lW
N0lp
N0lc
N0lc4
@ -92,6 +102,7 @@ cpdef enum:
N0lL
N0w
N0W
N0p
N0c
N0c4
@ -99,6 +110,7 @@ cpdef enum:
N0L
N1w
N1W
N1p
N1c
N1c4
@ -106,6 +118,7 @@ cpdef enum:
N1L
N2w
N2W
N2p
N2c
N2c4
@ -120,4 +133,8 @@ cpdef enum:
S1lv
S1rv
S0_has_head
S1_has_head
S2_has_head
CONTEXT_SIZE

View File

@ -13,7 +13,8 @@ from itertools import combinations
from ..tokens cimport TokenC
from ._state cimport State
from ._state cimport get_s2, get_s1, get_s0, get_n0, get_n1, get_n2
from ._state cimport get_left, get_right
from ._state cimport has_head, get_left, get_right
from ._state cimport count_left_kids, count_right_kids
cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
@ -24,10 +25,12 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
context[3] = 0
context[4] = 0
context[5] = 0
context[6] = 0
else:
context[0] = token.lex.sic
context[1] = token.pos
context[2] = token.lex.cluster
context[1] = token.lemma
context[2] = token.pos
context[3] = token.lex.cluster
# We've read in the string little-endian, so now we can take & (2**n)-1
# to get the first n bits of the cluster.
# e.g. s = "1110010101"
@ -40,9 +43,9 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
# What we're doing here is picking a number where all bits are 1, e.g.
# 15 is 1111, 63 is 111111 and doing bitwise AND, so getting all bits in
# the source that are set to 1.
context[3] = token.lex.cluster & 63
context[4] = token.lex.cluster & 15
context[5] = token.dep_tag
context[4] = token.lex.cluster & 63
context[5] = token.lex.cluster & 15
context[6] = token.dep_tag
cdef int fill_context(atom_t* context, State* state) except -1:
@ -66,11 +69,159 @@ cdef int fill_context(atom_t* context, State* state) except -1:
context[dist] = state.stack[0] - state.i
else:
context[dist] = 0
context[N0lv] = 0
context[S0lv] = 0
context[S0rv] = 0
context[S1lv] = 0
context[S1rv] = 0
context[N0lv] = max(count_left_kids(get_n0(state)), 5)
context[S0lv] = max(count_left_kids(get_s0(state)), 5)
context[S0rv] = max(count_right_kids(get_s0(state)), 5)
context[S1lv] = max(count_left_kids(get_s1(state)), 5)
context[S1rv] = max(count_right_kids(get_s1(state)), 5)
context[S0_has_head] = 0
context[S1_has_head] = 0
context[S2_has_head] = 0
if state.stack_len >= 1:
context[S0_has_head] = has_head(get_s0(state)) + 1
if state.stack_len >= 2:
context[S1_has_head] = has_head(get_s1(state)) + 1
if state.stack_len >= 3:
context[S2_has_head] = has_head(get_s2(state))
unigrams = (
(S2W, S2p),
(S2p,),
(S2c,),
(S2L,),
(S1W, S1p),
(S1p,),
(S1c,),
(S1L,),
(S0W, S0p),
(S0p,),
(S0c,),
(S0L,),
(N0W, N0p),
(N0p,),
(N0c,),
(N0L,),
(N1W, N1p),
(N1p,),
(N1c,),
(N2W, N2p),
(N2p,),
(N2c,),
(S0r2W, S0r2p),
(S0r2p,),
(S0r2c,),
(S0r2L,),
(S0rW, S0rp),
(S0rp,),
(S0rc,),
(S0rL,),
(S0l2W, S0l2p),
(S0l2p,),
(S0l2c,),
(S0l2L,),
(S0lW, S0lp),
(S0lp,),
(S0lc,),
(S0lL,),
(N0l2W, N0l2p),
(N0l2p,),
(N0l2c,),
(N0l2L,),
(N0lW, N0lp),
(N0lp,),
(N0lc,),
(N0lL,),
)
s0_n0 = (
(S0W, S0p, N0W, N0p),
(S0c, S0p, N0c, N0p),
(S0p, N0p),
(S0W, N0p),
(S0p, N0W),
(S0W, N0c),
(S0c, N0W),
(S0p, N0c),
(S0c, N0p),
(S0W, S0rp, N0p),
(S0p, S0rp, N0p),
(S0p, N0lp, N0W),
(S0p, N0lp, N0p),
)
s1_n0 = (
(S0_has_head, S1p, N0p),
(S0_has_head, S1c, N0c),
(S0_has_head, S1c, N0p),
(S0_has_head, S1p, N0c),
(S0_has_head, S1W, S1p, N0p),
(S0_has_head, S1p, N0W, N0p)
)
s0_n1 = (
(S0p, N1p),
(S0c, N1c),
(S0c, N1p),
(S0p, N1c),
(S0W, S0p, N1p),
(S0p, N1W, N1p)
)
n0_n1 = (
(N0W, N0p, N1W, N1p),
(N0W, N0p, N1p),
(N0p, N1W, N1p),
(N0c, N0p, N1c, N1p),
(N0c, N1c),
(N0p, N1c),
)
tree_shape = (
(dist,),
(S0p, S0_has_head, S1_has_head, S2_has_head),
(S0p, S0lv, S0rv),
(N0p, N0lv),
#(S0p, S0_left_shape),
#(S0p, S0_right_shape),
#(N0p, N0_left_shape),
#(S0p, S0_left_shape, N0_left_shape)
)
trigrams = (
(N0p, N1p, N2p),
(S0p, S0lp, S0l2p),
(S0p, S0rp, S0r2p),
(S0p, S1p, S2p),
(S1p, S0p, N0p),
(S0p, S0lp, N0p),
(S0p, N0p, N0lp),
(N0p, N0lp, N0l2p),
(S0W, S0p, S0rL, S0r2L),
(S0p, S0rL, S0r2L),
(S0W, S0p, S0lL, S0l2L),
(S0p, S0lL, S0l2L),
(N0W, N0p, N0lL, N0l2L),
(N0p, N0lL, N0l2L),
)
arc_eager = (
@ -86,7 +237,6 @@ arc_eager = (
(N2w, N2p),
(N2w,),
(N2p,),
(S0w, S0p, N0w, N0p),
(S0w, S0p, N0w),
(S0w, N0w, N0p),

View File

@ -20,8 +20,7 @@ cdef int pop_stack(State *s) except -1
cdef int push_stack(State *s) except -1
cdef inline bint has_head(const TokenC* t) nogil:
return t.head != 0
cdef bint has_head(const TokenC* t) nogil
cdef inline int get_idx(const State* s, const TokenC* t) nogil:
@ -79,12 +78,10 @@ cdef int head_in_stack(const State *s, const int child, int* gold) except -1
cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL
cdef inline int count_left_kids(const TokenC* head) nogil:
return _popcount(head.l_kids)
cdef int count_left_kids(const TokenC* head) nogil
cdef inline int count_right_kids(const TokenC* head) nogil:
return _popcount(head.r_kids)
cdef int count_right_kids(const TokenC* head) nogil
# From https://en.wikipedia.org/wiki/Hamming_weight

View File

@ -3,6 +3,7 @@ from libc.string cimport memmove
from cymem.cymem cimport Pool
from ..lexeme cimport EMPTY_LEXEME
from ..tokens cimport TokenC
cdef int add_dep(State *s, int head, int child, int label) except -1:
@ -88,6 +89,19 @@ cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx)
return NULL
cdef bint has_head(const TokenC* t) nogil:
return t.head != 0
cdef int count_left_kids(const TokenC* head) nogil:
return _popcount(head.l_kids)
cdef int count_right_kids(const TokenC* head) nogil:
return _popcount(head.r_kids)
DEF PADDING = 5

View File

@ -138,6 +138,16 @@ cdef class TransitionSystem:
unl_costs[RIGHT] = _right_cost(s, gold_heads) if _can_right(s) else -1
unl_costs[REDUCE] = _reduce_cost(s, gold_heads) if _can_reduce(s) else -1
#s0_buff_head = head_in_buffer(s, get_s0(s), gold_heads)
#s0_stack_head = head_in_stack(s, get_s0(s), gold_heads)
#s0_buff_kids = children_in_buffer(s, get_s0(s), gold_heads)
#s0_stack_kids = children_in_stack(s, get_s0(s), gold_heads)
#n0_buff_head = head_in_buffer(s, get_n0(s), gold_heads)
#n0_stack_head = head_in_stack(s, get_n0(s), gold_heads)
#n0_buff_kids = children_in_buffer(s, get_n0(s), gold_heads)
#n0_stack_kids = children_in_buffer(s, get_n0(s), gold_heads)
cdef int cost
cdef int move
cdef int label

View File

@ -49,7 +49,11 @@ cdef unicode print_state(State* s, list words):
def get_templates(name):
pf = _parse_features
if name == 'zhang':
return pf.arc_eager
else:
templs = pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + pf.tree_shape + pf.trigrams
return templs
cdef class GreedyParser:
@ -58,7 +62,6 @@ cdef class GreedyParser:
self.cfg = Config.read(model_dir, 'config')
self.extractor = Extractor(get_templates(self.cfg.features))
self.moves = TransitionSystem(self.cfg.left_labels, self.cfg.right_labels)
self.model = LinearModel(self.moves.n_moves, self.extractor.n_templ)
if os.path.exists(pjoin(model_dir, 'model')):
self.model.load(pjoin(model_dir, 'model'))
@ -77,9 +80,7 @@ cdef class GreedyParser:
fill_context(context, state)
feats = self.extractor.get_feats(context, &n_feats)
scores = self.model.get_scores(feats, n_feats)
guess = self.moves.best_valid(scores, state)
self.moves.transition(state, guess)
return 0
@ -110,7 +111,7 @@ cdef class GreedyParser:
self.moves.transition(state, guess)
cdef int n_corr = 0
for i in range(tokens.length):
n_corr += (i + state.sent[i].head) == heads_array[i]
n_corr += (i + state.sent[i].head) == gold_heads[i]
return n_corr