* Work on greedy parser

This commit is contained in:
Matthew Honnibal 2014-12-17 03:19:43 +11:00
parent 95ccea03b2
commit d524dd306a
9 changed files with 3020 additions and 2864 deletions

File diff suppressed because it is too large Load Diff

View File

@ -17,6 +17,14 @@ from ._state cimport get_left, get_right
cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
if token is NULL:
context[0] = 0
context[1] = 0
context[2] = 0
context[3] = 0
context[4] = 0
context[5] = 0
else:
context[0] = token.lex.sic context[0] = token.lex.sic
context[1] = token.pos context[1] = token.pos
context[2] = token.lex.cluster context[2] = token.lex.cluster
@ -40,30 +48,29 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
cdef int fill_context(atom_t* context, State* state) except -1: cdef int fill_context(atom_t* context, State* state) except -1:
# This fills in the basic properties of each of our "slot" tokens, e.g. # This fills in the basic properties of each of our "slot" tokens, e.g.
# word on top of the stack, word at the front of the buffer, etc. # word on top of the stack, word at the front of the buffer, etc.
cdef TokenC* n1 = get_n1(state)
fill_token(&context[S2w], get_s2(state)) fill_token(&context[S2w], get_s2(state))
fill_token(&context[S1w], get_s1(state)) fill_token(&context[S1w], get_s1(state))
#fill_token(&context[S1rw], get_right(state, get_s1(state), 0)) fill_token(&context[S1rw], get_right(state, get_s1(state), 1))
fill_token(&context[S0lw], get_left(state, get_s0(state), 0)) fill_token(&context[S0lw], get_left(state, get_s0(state), 1))
fill_token(&context[S0l2w], get_left(state, get_s0(state), 1)) fill_token(&context[S0l2w], get_left(state, get_s0(state), 2))
fill_token(&context[S0w], get_s0(state)) fill_token(&context[S0w], get_s0(state))
#fill_token(&context[S0r2w], get_right(state, get_s0(state), 1)) fill_token(&context[S0r2w], get_right(state, get_s0(state), 2))
fill_token(&context[S0rw], get_right(state, get_s0(state), 0)) fill_token(&context[S0rw], get_right(state, get_s0(state), 1))
#fill_token(&context[N0lw], get_left(state, get_n0(state), 0)) fill_token(&context[N0lw], get_left(state, get_n0(state), 0))
#fill_token(&context[N0l2w], get_left(state, get_n0(state), 1)) fill_token(&context[N0l2w], get_left(state, get_n0(state), 1))
fill_token(&context[N0w], get_n0(state)) fill_token(&context[N0w], get_n0(state))
#fill_token(&context[N1w], get_n1(state)) fill_token(&context[N1w], get_n1(state))
#fill_token(&context[N2w], get_n2(state)) fill_token(&context[N2w], get_n2(state))
#if state.stack_len >= 1: if state.stack_len >= 1:
# context[dist] = state.stack[0] - state.sent context[dist] = state.stack[0] - state.i
#else: else:
# context[dist] = 0 context[dist] = 0
#context[N0lv] = 0 context[N0lv] = 0
#context[S0lv] = 0 context[S0lv] = 0
#context[S0rv] = 0 context[S0rv] = 0
#context[S1lv] = 0 context[S1lv] = 0
#context[S1rv] = 0 context[S1rv] = 0
arc_eager = ( arc_eager = (

File diff suppressed because it is too large Load Diff

View File

@ -7,15 +7,16 @@ from ..tokens cimport TokenC
cdef struct State: cdef struct State:
TokenC* sent TokenC* sent
int* stack
int i int i
int sent_len int sent_len
int stack_len int stack_len
cdef int add_dep(State *s, TokenC* head, TokenC* child, int label) except -1 cdef int add_dep(const State *s, const int head, const int child, const int label) except -1
cdef TokenC* pop_stack(State *s) except NULL cdef int pop_stack(State *s) except -1
cdef int push_stack(State *s) except -1 cdef int push_stack(State *s) except -1
@ -32,33 +33,35 @@ cdef inline TokenC* get_n0(const State* s) nogil:
cdef inline TokenC* get_n1(const State* s) nogil: cdef inline TokenC* get_n1(const State* s) nogil:
if s.i < (s.sent_len - 1): if (s.i+1) >= s.sent_len:
return &s.sent[s.i+1] return NULL
else: else:
return s.sent - 1 return &s.sent[s.i+1]
cdef inline TokenC* get_n2(const State* s) nogil: cdef inline TokenC* get_n2(const State* s) nogil:
if (s.i + 2) >= s.sent_len:
return NULL
else:
return &s.sent[s.i+2] return &s.sent[s.i+2]
cdef inline TokenC* get_s0(const State *s) nogil: cdef inline TokenC* get_s0(const State *s) nogil:
return s.stack[0] return &s.sent[s.stack[0]]
cdef inline TokenC* get_s1(const State *s) nogil: cdef inline TokenC* get_s1(const State *s) nogil:
# Rely on our padding to ensure we don't go out of bounds here # Rely on our padding to ensure we don't go out of bounds here
cdef TokenC** s1 = s.stack - 1 return &s.sent[s.stack[-1]]
return s1[0]
cdef inline TokenC* get_s2(const State *s) nogil: cdef inline TokenC* get_s2(const State *s) nogil:
# Rely on our padding to ensure we don't go out of bounds here # Rely on our padding to ensure we don't go out of bounds here
cdef TokenC** s2 = s.stack - 2 return &s.sent[s.stack[-2]]
return s2[0]
cdef TokenC* get_right(State* s, TokenC* head, int idx) nogil cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil
cdef TokenC* get_left(State* s, TokenC* head, int idx) nogil
cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) nogil
cdef inline bint at_eol(const State *s) nogil: cdef inline bint at_eol(const State *s) nogil:
return s.i >= s.sent_len return s.i >= s.sent_len
@ -68,10 +71,10 @@ cdef inline bint is_final(const State *s) nogil:
return at_eol(s) # The stack will be attached to root anyway return at_eol(s) # The stack will be attached to root anyway
cdef int children_in_buffer(const State *s, const TokenC* target, list gold) except -1 cdef int children_in_buffer(const State *s, const int head, list gold) except -1
cdef int head_in_buffer(const State *s, const TokenC* target, list gold) except -1 cdef int head_in_buffer(const State *s, const int child, list gold) except -1
cdef int children_in_stack(const State *s, const TokenC* target, list gold) except -1 cdef int children_in_stack(const State *s, const int head, list gold) except -1
cdef int head_in_stack(const State *s, const TokenC*, list gold) except -1 cdef int head_in_stack(const State *s, const int child, list gold) except -1
cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL
@ -81,5 +84,7 @@ cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
cdef int i cdef int i
for i in range(32): for i in range(32):
if bits & (1 << i): if bits & (1 << i):
n -= 1
if n < 1:
return i return i
return 0 return 0

View File

@ -5,104 +5,82 @@ from cymem.cymem cimport Pool
from ..lexeme cimport EMPTY_LEXEME from ..lexeme cimport EMPTY_LEXEME
cdef int add_dep(State *s, TokenC* head, TokenC* child, int label) except -1: cdef int add_dep(State *s, int head, int child, int label) except -1:
child.head = head - child s.sent[child].head = head - child
child.dep_tag = label s.sent[child].dep_tag = label
# Keep a bit-vector tracking child dependencies. If a word has a child at # Keep a bit-vector tracking child dependencies. If a word has a child at
# offset i from it, set that bit (tracking left and right separately) # offset i from it, set that bit (tracking left and right separately)
if child > head: if child > head:
head.r_kids |= 1 << child.head s.sent[head].r_kids |= 1 << (-s.sent[child].head)
else: else:
head.l_kids |= 1 << (-child.head) s.sent[head].l_kids |= 1 << s.sent[child].head
cdef TokenC* pop_stack(State *s) except NULL: cdef int pop_stack(State *s) except -1:
assert s.stack_len >= 1 assert s.stack_len >= 1
cdef TokenC* top = s.stack[0]
s.stack -= 1
s.stack_len -= 1 s.stack_len -= 1
return top s.stack -= 1
cdef int push_stack(State *s) except -1: cdef int push_stack(State *s) except -1:
assert s.i < s.sent_len assert s.i < s.sent_len
s.stack += 1 s.stack += 1
s.stack[0] = &s.sent[s.i] s.stack[0] = s.i
s.stack_len += 1 s.stack_len += 1
s.i += 1 s.i += 1
cdef int children_in_buffer(const State *s, const TokenC* target, list gold) except -1: cdef int children_in_buffer(const State *s, int head, list gold) except -1:
# Golds holds an array of head offsets --- the head of word i is i - golds[i] # Golds holds an array of head offsets --- the head of word i is i - golds[i]
# Iterate over the tokens of the queue, and check whether their gold head is # Iterate over the tokens of the queue, and check whether their gold head is
# our target # our target
cdef int i cdef int i
cdef int n = 0 cdef int n = 0
cdef TokenC* buff_word
cdef TokenC* buff_head
cdef int buff_word_head_offset
for i in range(s.i, s.sent_len): for i in range(s.i, s.sent_len):
buff_word = &s.sent[i] if gold[i] == head:
buff_word_head_offset = gold[i]
buff_head = buff_word + buff_word_head_offset
if buff_head == target:
n += 1 n += 1
return n return n
cdef int head_in_buffer(const State *s, const TokenC* target, list gold) except -1: cdef int head_in_buffer(const State *s, const int child, list gold) except -1:
cdef int target_idx = get_idx(s, target) return gold[child] >= s.i
cdef int target_head_idx = target_idx + gold[target_idx]
return target_head_idx >= s.i
cdef int children_in_stack(const State *s, const TokenC* target, list gold) except -1: cdef int children_in_stack(const State *s, const int head, list gold) except -1:
if s.stack_len == 0:
return 0
cdef int i cdef int i
cdef int n = 0 cdef int n = 0
cdef const TokenC* stack_word
cdef const TokenC* stack_word_head
cdef int stack_word_head_offset
for i in range(s.stack_len): for i in range(s.stack_len):
stack_word = (s.stack - i)[0] if gold[s.stack[-i]] == head:
stack_word_head_offset = gold[get_idx(s, stack_word)]
stack_word_head = (s.stack + stack_word_head_offset)[0]
if stack_word_head == target:
n += 1 n += 1
return n return n
cdef int head_in_stack(const State *s, const TokenC* target, list gold) except -1: cdef int head_in_stack(const State *s, const int child, list gold) except -1:
if s.stack_len == 0:
return 0
cdef int head_offset = gold[get_idx(s, target)]
cdef const TokenC* target_head = target + head_offset
cdef int i cdef int i
for i in range(s.stack_len): for i in range(s.stack_len):
if target_head == (s.stack - i)[0]: if gold[child] == s.stack[-i]:
return 1 return 1
return 0 return 0
cdef TokenC* get_left(State* s, TokenC* head, int idx) nogil: cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) nogil:
cdef uint32_t kids = head.l_kids cdef uint32_t kids = head.l_kids
if kids == 0: if kids == 0:
return s.sent - 1 return NULL
cdef int offset = _nth_significant_bit(kids, idx) cdef int offset = _nth_significant_bit(kids, idx)
cdef TokenC* child = head - offset cdef const TokenC* child = head - offset
if child >= s.sent: if child >= s.sent:
return child return child
else: else:
return s.sent - 1 return s.sent - 1
cdef TokenC* get_right(State* s, TokenC* head, int idx) nogil: cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil:
cdef uint32_t kids = head.r_kids cdef uint32_t kids = head.r_kids
if kids == 0: if kids == 0:
return s.sent - 1 return NULL
cdef int offset = _nth_significant_bit(kids, idx) cdef int offset = _nth_significant_bit(kids, idx)
cdef TokenC* child = head + offset cdef const TokenC* child = head + offset
if child < (s.sent + s.sent_len): if child < (s.sent + s.sent_len):
return child return child
else: else:
@ -115,13 +93,11 @@ DEF PADDING = 5
cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL: cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL:
cdef int padded_len = sent_length + PADDING + PADDING cdef int padded_len = sent_length + PADDING + PADDING
cdef State* s = <State*>mem.alloc(1, sizeof(State)) cdef State* s = <State*>mem.alloc(1, sizeof(State))
s.stack = <TokenC**>mem.alloc(padded_len, sizeof(TokenC*)) s.stack = <int*>mem.alloc(padded_len, sizeof(int))
cdef TokenC* eol_token = sent - 1
for i in range(PADDING): for i in range(PADDING):
# sent should be padded, with a suitable sentinel token here s.stack[i] = -1
s.stack[0] = eol_token s.stack += (PADDING - 1)
s.stack += 1 assert s.stack[0] == -1
s.stack[0] = eol_token
s.sent = sent s.sent = sent
s.stack_len = 0 s.stack_len = 0
s.i = 0 s.i = 0

File diff suppressed because it is too large Load Diff

View File

@ -35,43 +35,35 @@ cdef inline bint _can_reduce(const State* s) nogil:
cdef int _shift_cost(const State* s, list gold) except -1: cdef int _shift_cost(const State* s, list gold) except -1:
assert not at_eol(s) assert not at_eol(s)
cost = 0 cost = 0
cost += head_in_stack(s, get_n0(s), gold) cost += head_in_stack(s, s.i, gold)
cost += children_in_stack(s, get_n0(s), gold) cost += children_in_stack(s, s.i, gold)
return cost return cost
cdef int _right_cost(const State* s, list gold) except -1: cdef int _right_cost(const State* s, list gold) except -1:
assert s.stack_len >= 1 assert s.stack_len >= 1
cdef int s0_idx = get_idx(s, get_s0(s))
cost = 0 cost = 0
if _gold_dep(s, get_s0(s), get_n0(s), gold): if gold[s.i] == s.stack[0]:
return cost return cost
cost += head_in_buffer(s, get_n0(s), gold) cost += head_in_buffer(s, s.i, gold)
cost += children_in_stack(s, get_n0(s), gold) cost += children_in_stack(s, s.i, gold)
cost += head_in_stack(s, get_n0(s), gold) cost += head_in_stack(s, s.i, gold)
return cost return cost
cdef int _left_cost(const State* s, list gold) except -1: cdef int _left_cost(const State* s, list gold) except -1:
assert s.stack_len >= 1 assert s.stack_len >= 1
cost = 0 cost = 0
if _gold_dep(s, get_n0(s), get_s0(s), gold): if gold[s.stack[0]] == s.i:
return cost return cost
cost += head_in_buffer(s, get_s0(s), gold) cost += head_in_buffer(s, s.stack[0], gold)
cost += children_in_buffer(s, get_s0(s), gold) cost += children_in_buffer(s, s.stack[0], gold)
return cost return cost
cdef int _reduce_cost(const State* s, list gold) except -1: cdef int _reduce_cost(const State* s, list gold) except -1:
return children_in_buffer(s, get_s0(s), gold) return children_in_buffer(s, s.stack[0], gold)
cdef int _gold_dep(const State* s, const TokenC* head, const TokenC* child,
list gold_offsets) except -1:
cdef int head_idx = get_idx(s, head)
cdef int child_idx = get_idx(s, child)
return child_idx + gold_offsets[child_idx] == head_idx
cdef class TransitionSystem: cdef class TransitionSystem:
@ -109,10 +101,10 @@ cdef class TransitionSystem:
if t.move == SHIFT: if t.move == SHIFT:
push_stack(s) push_stack(s)
elif t.move == LEFT: elif t.move == LEFT:
add_dep(s, get_n0(s), get_s0(s), t.label) add_dep(s, s.i, s.stack[0], t.label)
pop_stack(s) pop_stack(s)
elif t.move == RIGHT: elif t.move == RIGHT:
add_dep(s, get_s0(s), get_n0(s), t.label) add_dep(s, s.stack[0], s.i, t.label)
push_stack(s) push_stack(s)
elif t.move == REDUCE: elif t.move == REDUCE:
pop_stack(s) pop_stack(s)
@ -157,12 +149,12 @@ cdef class TransitionSystem:
if move == SHIFT or move == REDUCE: if move == SHIFT or move == REDUCE:
cost = 0 cost = 0
elif move == LEFT: elif move == LEFT:
if _gold_dep(s, get_n0(s), get_s0(s), gold_heads): if gold_heads[s.stack[0]] == s.i:
cost = label != gold_labels[get_idx(s, get_s0(s))] cost = label != gold_labels[s.stack[0]]
else: else:
cost = 0 cost = 0
elif move == RIGHT: elif move == RIGHT:
if _gold_dep(s, get_s0(s), get_n0(s), gold_heads): if gold_heads[s.i] == s.stack[0]:
cost = label != gold_labels[s.i] cost = label != gold_labels[s.i]
else: else:
cost = 0 cost = 0
@ -173,24 +165,14 @@ cdef class TransitionSystem:
score = scores[i] score = scores[i]
if best < 0: if best < 0:
for i in range(self.n_moves): print unl_costs[SHIFT], unl_costs[REDUCE], unl_costs[LEFT], unl_costs[RIGHT]
if self._moves[i].move == LEFT: print s.stack_len
print self._moves[i].label, print has_head(get_s0(s))
print print s.sent[s.stack[0]].head
print _gold_dep(s, get_n0(s), get_s0(s), gold_heads) print s.stack[0], s.i
print gold_labels[get_idx(s, get_s0(s))] print gold_heads[s.stack[0]], gold_heads[s.i]
print unl_costs[LEFT] print gold_labels[s.i]
print "S0:" print children_in_buffer(s, s.stack[0], gold_heads)
print "Head:", gold_heads[get_idx(s, get_s0(s))] print head_in_buffer(s, s.stack[0], gold_heads)
print "h. in b.", head_in_buffer(s, get_s0(s), gold_heads)
print "c. in b.", children_in_buffer(s, get_s0(s), gold_heads)
print "h. in s.", head_in_stack(s, get_s0(s), gold_heads)
print "c. in s.", children_in_stack(s, get_s0(s), gold_heads)
print "N0:"
print "Head:", gold_heads[get_idx(s, get_n0(s))]
print "h. in b.", head_in_buffer(s, get_n0(s), gold_heads)
print "c. in b.", children_in_buffer(s, get_n0(s), gold_heads)
print "h. in s.", head_in_stack(s, get_n0(s), gold_heads)
print "c. in s.", children_in_stack(s, get_n0(s), gold_heads)
raise StandardError raise StandardError
return best return best

File diff suppressed because it is too large Load Diff

View File

@ -32,9 +32,6 @@ from . import _parse_features
from ._parse_features cimport fill_context, CONTEXT_SIZE from ._parse_features cimport fill_context, CONTEXT_SIZE
DEF CONTEXT_SIZE = 50
DEBUG = False DEBUG = False
def set_debug(val): def set_debug(val):
global DEBUG global DEBUG
@ -43,8 +40,8 @@ def set_debug(val):
cdef unicode print_state(State* s, list words): cdef unicode print_state(State* s, list words):
words = list(words) + ['EOL'] words = list(words) + ['EOL']
top = words[get_idx(s, get_s0(s))] top = words[s.stack[0]]
second = words[get_idx(s, get_s1(s))] second = words[s.stack[-1]]
n0 = words[s.i] n0 = words[s.i]
n1 = words[s.i + 1] n1 = words[s.i + 1]
return ' '.join((second, top, '|', n0, n1)) return ' '.join((second, top, '|', n0, n1))
@ -61,7 +58,7 @@ cdef class GreedyParser:
self.extractor = Extractor(get_templates(self.cfg.features)) self.extractor = Extractor(get_templates(self.cfg.features))
self.moves = TransitionSystem(self.cfg.left_labels, self.cfg.right_labels) self.moves = TransitionSystem(self.cfg.left_labels, self.cfg.right_labels)
self.model = LinearModel(self.moves.n_moves, self.extractor.n_templ) self.model = LinearModel(self.moves.n_moves, self.extractor.n_templ + 10000)
if os.path.exists(pjoin(model_dir, 'model')): if os.path.exists(pjoin(model_dir, 'model')):
self.model.load(pjoin(model_dir, 'model')) self.model.load(pjoin(model_dir, 'model'))
@ -94,7 +91,12 @@ cdef class GreedyParser:
cdef Pool mem = Pool() cdef Pool mem = Pool()
cdef State* state = init_state(mem, tokens.data, tokens.length) cdef State* state = init_state(mem, tokens.data, tokens.length)
words = [t.string for t in tokens] words = [t.string for t in tokens]
if DEBUG:
print words
print gold_heads
while not is_final(state): while not is_final(state):
if DEBUG:
print print_state(state, words)
fill_context(context, state) fill_context(context, state)
feats = self.extractor.get_feats(context, &n_feats) feats = self.extractor.get_feats(context, &n_feats)
scores = self.model.get_scores(feats, n_feats) scores = self.model.get_scores(feats, n_feats)
@ -109,5 +111,5 @@ cdef class GreedyParser:
cdef int i cdef int i
n_corr = 0 n_corr = 0
for i in range(tokens.length): for i in range(tokens.length):
n_corr += state.sent[i].head == gold_heads[i] n_corr += (i + state.sent[i].head) == gold_heads[i]
return n_corr return n_corr