* Work on greedy parser

This commit is contained in:
Matthew Honnibal 2014-12-17 03:19:43 +11:00
parent 95ccea03b2
commit d524dd306a
9 changed files with 3020 additions and 2864 deletions

File diff suppressed because it is too large Load Diff

View File

@ -17,6 +17,14 @@ from ._state cimport get_left, get_right
cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
if token is NULL:
context[0] = 0
context[1] = 0
context[2] = 0
context[3] = 0
context[4] = 0
context[5] = 0
else:
context[0] = token.lex.sic
context[1] = token.pos
context[2] = token.lex.cluster
@ -40,30 +48,29 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
cdef int fill_context(atom_t* context, State* state) except -1:
    # Fill in the basic properties of each of our "slot" tokens, e.g. the word
    # on top of the stack, the word at the front of the buffer, etc. Each slot
    # is written exactly once via fill_token, which zeroes the slot when the
    # token pointer is NULL.
    #
    # Child indices handed to get_left/get_right are 1-based: they end up in
    # _nth_significant_bit, which decrements the index per set bit and returns
    # once it drops below 1. Index 0 would therefore not select a distinct
    # child from index 1, so every slot pair uses 1 and 2.
    fill_token(&context[S2w], get_s2(state))
    fill_token(&context[S1w], get_s1(state))
    fill_token(&context[S1rw], get_right(state, get_s1(state), 1))
    fill_token(&context[S0lw], get_left(state, get_s0(state), 1))
    fill_token(&context[S0l2w], get_left(state, get_s0(state), 2))
    fill_token(&context[S0w], get_s0(state))
    fill_token(&context[S0r2w], get_right(state, get_s0(state), 2))
    fill_token(&context[S0rw], get_right(state, get_s0(state), 1))
    fill_token(&context[N0lw], get_left(state, get_n0(state), 1))
    fill_token(&context[N0l2w], get_left(state, get_n0(state), 2))
    fill_token(&context[N0w], get_n0(state))
    fill_token(&context[N1w], get_n1(state))
    fill_token(&context[N2w], get_n2(state))
    # Signed gap between the stack top and the buffer front (both are token
    # indices); 0 when the stack is empty.
    if state.stack_len >= 1:
        context[dist] = state.stack[0] - state.i
    else:
        context[dist] = 0
    # The valency slots are not computed here; they are zeroed explicitly.
    context[N0lv] = 0
    context[S0lv] = 0
    context[S0rv] = 0
    context[S1lv] = 0
    context[S1rv] = 0
arc_eager = (

File diff suppressed because it is too large Load Diff

View File

@ -7,15 +7,16 @@ from ..tokens cimport TokenC
# Parser state shared by the transition and oracle helpers.
cdef struct State:
# Token array being parsed; stack entries and `i` are indices into it.
TokenC* sent
# Stack of token indices. stack[0] is the top; deeper entries are read at
# negative offsets (stack[-1], stack[-2]).
int* stack
# Index of the token at the front of the buffer.
int i
# Upper bound used for buffer checks (`i >= sent_len` means end of input);
# presumably the number of tokens in `sent` -- confirm against init_state.
int sent_len
# Number of entries currently on the stack.
int stack_len
# State mutators. `head` and `child` are token indices into State.sent, and
# the stack holds indices too, so pop_stack only reports success/failure.
# NOTE(review): the add_dep definition visible in this change takes a
# non-const State* and plain ints -- confirm the two signatures are meant
# to differ in constness.
cdef int add_dep(const State *s, const int head, const int child, const int label) except -1
cdef int pop_stack(State *s) except -1
cdef int push_stack(State *s) except -1
@ -32,33 +33,35 @@ cdef inline TokenC* get_n0(const State* s) nogil:
cdef inline TokenC* get_n1(const State* s) nogil:
    # Token one past the front of the buffer, or NULL when that position lies
    # at or beyond sent_len (same convention as get_n2).
    if (s.i + 1) >= s.sent_len:
        return NULL
    else:
        return &s.sent[s.i+1]
cdef inline TokenC* get_n2(const State* s) nogil:
    # Token two past the front of the buffer; NULL when it would fall at or
    # beyond sent_len.
    if (s.i + 2) < s.sent_len:
        return &s.sent[s.i+2]
    return NULL
cdef inline TokenC* get_s0(const State *s) nogil:
    # The stack stores token indices, so map the top entry to its token.
    # NOTE(review): with an empty stack the entry read here is a padding
    # value, so this relies on `sent` being padded in front -- confirm.
    return &s.sent[s.stack[0]]
cdef inline TokenC* get_s1(const State *s) nogil:
    # Second item on the stack (one entry below the top).
    # Rely on our padding to ensure we don't go out of bounds here
    return &s.sent[s.stack[-1]]
cdef inline TokenC* get_s2(const State *s) nogil:
    # Third item on the stack (two entries below the top).
    # Rely on our padding to ensure we don't go out of bounds here
    return &s.sent[s.stack[-2]]
# Child lookup by index among the right/left children recorded on `head`.
# Neither helper mutates the state or the head token, so everything is const.
cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil
cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) nogil
cdef inline bint at_eol(const State *s) nogil:
    # True once the buffer index is no longer below sent_len.
    return not (s.i < s.sent_len)
@ -68,10 +71,10 @@ cdef inline bint is_final(const State *s) nogil:
return at_eol(s) # The stack will be attached to root anyway
# Oracle helpers. `head` / `child` are token indices and `gold` is indexed by
# token index.
cdef int children_in_buffer(const State *s, const int head, list gold) except -1
cdef int head_in_buffer(const State *s, const int child, list gold) except -1
cdef int children_in_stack(const State *s, const int head, list gold) except -1
cdef int head_in_stack(const State *s, const int child, list gold) except -1

cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL
@ -81,5 +84,7 @@ cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
cdef int i
for i in range(32):
if bits & (1 << i):
n -= 1
if n < 1:
return i
return 0

View File

@ -5,104 +5,82 @@ from cymem.cymem cimport Pool
from ..lexeme cimport EMPTY_LEXEME
cdef int add_dep(State *s, int head, int child, int label) except -1:
    # Attach token `child` to token `head` with dependency label `label`.
    # Both are indices into s.sent; the head is stored on the child as the
    # relative offset (head - child).
    cdef int offset = head - child
    s.sent[child].head = offset
    s.sent[child].dep_tag = label
    # Keep a bit-vector tracking child dependencies. If a word has a child at
    # offset i from it, set that bit (tracking left and right separately)
    cdef int distance = -offset if child > head else offset
    # The masks are read back as uint32_t by get_left/get_right, so only
    # distances below 32 are representable. Shifting by 32 or more is
    # undefined in C, so such children are left out of the mask.
    if distance < 32:
        if child > head:
            s.sent[head].r_kids |= (<uint32_t>1) << distance
        else:
            s.sent[head].l_kids |= (<uint32_t>1) << distance
cdef int pop_stack(State *s) except -1:
    # Drop the top stack entry. The stack pointer moves back by one slot, so
    # the entry below becomes s.stack[0]. Raises AssertionError (reported as
    # -1) when the stack is empty.
    assert s.stack_len >= 1
    s.stack_len -= 1
    s.stack -= 1
cdef int push_stack(State *s) except -1:
    # Move the buffer-front token onto the stack: advance the stack pointer,
    # store the token's index there, then step the buffer index forward.
    assert s.i < s.sent_len
    s.stack += 1
    s.stack[0] = s.i
    s.stack_len += 1
    s.i += 1
cdef int children_in_buffer(const State *s, const int head, list gold) except -1:
    # Count the buffer tokens (indices s.i .. s.sent_len - 1) whose gold entry
    # equals `head`. gold[i] is compared directly against a token index, i.e.
    # it is treated as the head index of token i, not as a relative offset.
    cdef int i
    cdef int n = 0
    for i in range(s.i, s.sent_len):
        if gold[i] == head:
            n += 1
    return n
cdef int head_in_buffer(const State *s, const int child, list gold) except -1:
    # 1 if the gold head index of `child` is at or beyond the buffer front
    # (s.i), otherwise 0.
    return gold[child] >= s.i
cdef int children_in_stack(const State *s, const int head, list gold) except -1:
    # Count the stack entries whose gold head index equals `head`. The stack
    # is walked from the top (s.stack[0]) downwards via negative offsets; an
    # empty stack yields 0 because the loop body never runs.
    cdef int i
    cdef int n = 0
    for i in range(s.stack_len):
        if gold[s.stack[-i]] == head:
            n += 1
    return n
cdef int head_in_stack(const State *s, const int child, list gold) except -1:
    # 1 if the gold head index of `child` is one of the token indices
    # currently on the stack, otherwise 0 (including for an empty stack).
    cdef int i
    for i in range(s.stack_len):
        if gold[child] == s.stack[-i]:
            return 1
    return 0
cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) nogil:
    # Look up a left child of `head` from its l_kids bit mask, where bit k
    # marks a child k tokens to the left. `idx` selects which set bit to use
    # (resolved by _nth_significant_bit). Returns NULL when `head` has no
    # left children recorded.
    cdef uint32_t kids = head.l_kids
    if kids == 0:
        return NULL
    cdef int offset = _nth_significant_bit(kids, idx)
    cdef const TokenC* child = head - offset
    if child >= s.sent:
        return child
    else:
        # NOTE(review): this out-of-range path still returns the padding
        # token in front of the sentence rather than NULL like the branch
        # above -- confirm which sentinel callers expect.
        return s.sent - 1
cdef TokenC* get_right(State* s, TokenC* head, int idx) nogil:
cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil:
cdef uint32_t kids = head.r_kids
if kids == 0:
return s.sent - 1
return NULL
cdef int offset = _nth_significant_bit(kids, idx)
cdef TokenC* child = head + offset
cdef const TokenC* child = head + offset
if child < (s.sent + s.sent_len):
return child
else:
@ -115,13 +93,11 @@ DEF PADDING = 5
cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL:
cdef int padded_len = sent_length + PADDING + PADDING
cdef State* s = <State*>mem.alloc(1, sizeof(State))
s.stack = <TokenC**>mem.alloc(padded_len, sizeof(TokenC*))
cdef TokenC* eol_token = sent - 1
s.stack = <int*>mem.alloc(padded_len, sizeof(int))
for i in range(PADDING):
# sent should be padded, with a suitable sentinel token here
s.stack[0] = eol_token
s.stack += 1
s.stack[0] = eol_token
s.stack[i] = -1
s.stack += (PADDING - 1)
assert s.stack[0] == -1
s.sent = sent
s.stack_len = 0
s.i = 0

File diff suppressed because it is too large Load Diff

View File

@ -35,43 +35,35 @@ cdef inline bint _can_reduce(const State* s) nogil:
cdef int _shift_cost(const State* s, list gold) except -1:
    # Cost of pushing the buffer-front token (index s.i): its gold head being
    # on the stack, plus the number of its gold children on the stack.
    assert not at_eol(s)
    cost = 0
    cost += head_in_stack(s, s.i, gold)
    cost += children_in_stack(s, s.i, gold)
    return cost
cdef int _right_cost(const State* s, list gold) except -1:
    # Cost of attaching the buffer front (s.i) as a child of the stack top.
    # Zero when that arc is the gold one; otherwise the buffer front's gold
    # head in the buffer or on the stack, plus its gold children on the stack.
    assert s.stack_len >= 1
    cost = 0
    if gold[s.i] == s.stack[0]:
        return cost
    cost += head_in_buffer(s, s.i, gold)
    cost += children_in_stack(s, s.i, gold)
    cost += head_in_stack(s, s.i, gold)
    return cost
cdef int _left_cost(const State* s, list gold) except -1:
    # Cost of attaching the stack top as a child of the buffer front (s.i).
    # Zero when that arc is the gold one; otherwise the stack top's gold head
    # being in the buffer, plus the number of its gold children there.
    assert s.stack_len >= 1
    cost = 0
    if gold[s.stack[0]] == s.i:
        return cost
    cost += head_in_buffer(s, s.stack[0], gold)
    cost += children_in_buffer(s, s.stack[0], gold)
    return cost
cdef int _reduce_cost(const State* s, list gold) except -1:
    # Cost of popping the stack top: the number of its gold children still in
    # the buffer. children_in_buffer takes a token index, so pass the stack
    # entry itself rather than a token pointer.
    return children_in_buffer(s, s.stack[0], gold)


cdef int _gold_dep(const State* s, const TokenC* head, const TokenC* child,
                   list gold_offsets) except -1:
    # Pointer-based check that `head` is the gold head of `child`, with the
    # gold data expressed as offsets relative to the child.
    # NOTE(review): the index-based cost functions in this file compare gold
    # entries with token indices directly; confirm whether this helper (and
    # get_idx) is still needed.
    cdef int head_idx = get_idx(s, head)
    cdef int child_idx = get_idx(s, child)
    return child_idx + gold_offsets[child_idx] == head_idx
cdef class TransitionSystem:
@ -109,10 +101,10 @@ cdef class TransitionSystem:
if t.move == SHIFT:
push_stack(s)
elif t.move == LEFT:
add_dep(s, get_n0(s), get_s0(s), t.label)
add_dep(s, s.i, s.stack[0], t.label)
pop_stack(s)
elif t.move == RIGHT:
add_dep(s, get_s0(s), get_n0(s), t.label)
add_dep(s, s.stack[0], s.i, t.label)
push_stack(s)
elif t.move == REDUCE:
pop_stack(s)
@ -157,12 +149,12 @@ cdef class TransitionSystem:
if move == SHIFT or move == REDUCE:
cost = 0
elif move == LEFT:
if _gold_dep(s, get_n0(s), get_s0(s), gold_heads):
cost = label != gold_labels[get_idx(s, get_s0(s))]
if gold_heads[s.stack[0]] == s.i:
cost = label != gold_labels[s.stack[0]]
else:
cost = 0
elif move == RIGHT:
if _gold_dep(s, get_s0(s), get_n0(s), gold_heads):
if gold_heads[s.i] == s.stack[0]:
cost = label != gold_labels[s.i]
else:
cost = 0
@ -173,24 +165,14 @@ cdef class TransitionSystem:
score = scores[i]
if best < 0:
for i in range(self.n_moves):
if self._moves[i].move == LEFT:
print self._moves[i].label,
print
print _gold_dep(s, get_n0(s), get_s0(s), gold_heads)
print gold_labels[get_idx(s, get_s0(s))]
print unl_costs[LEFT]
print "S0:"
print "Head:", gold_heads[get_idx(s, get_s0(s))]
print "h. in b.", head_in_buffer(s, get_s0(s), gold_heads)
print "c. in b.", children_in_buffer(s, get_s0(s), gold_heads)
print "h. in s.", head_in_stack(s, get_s0(s), gold_heads)
print "c. in s.", children_in_stack(s, get_s0(s), gold_heads)
print "N0:"
print "Head:", gold_heads[get_idx(s, get_n0(s))]
print "h. in b.", head_in_buffer(s, get_n0(s), gold_heads)
print "c. in b.", children_in_buffer(s, get_n0(s), gold_heads)
print "h. in s.", head_in_stack(s, get_n0(s), gold_heads)
print "c. in s.", children_in_stack(s, get_n0(s), gold_heads)
print unl_costs[SHIFT], unl_costs[REDUCE], unl_costs[LEFT], unl_costs[RIGHT]
print s.stack_len
print has_head(get_s0(s))
print s.sent[s.stack[0]].head
print s.stack[0], s.i
print gold_heads[s.stack[0]], gold_heads[s.i]
print gold_labels[s.i]
print children_in_buffer(s, s.stack[0], gold_heads)
print head_in_buffer(s, s.stack[0], gold_heads)
raise StandardError
return best

File diff suppressed because it is too large Load Diff

View File

@ -32,9 +32,6 @@ from . import _parse_features
from ._parse_features cimport fill_context, CONTEXT_SIZE
# NOTE(review): CONTEXT_SIZE is also cimported from ._parse_features in the
# line above -- confirm this compile-time DEF is still wanted alongside it.
DEF CONTEXT_SIZE = 50
# Module-level debug flag; set_debug() declares it global, presumably to
# reassign it.
DEBUG = False
def set_debug(val):
global DEBUG
@ -43,8 +40,8 @@ def set_debug(val):
cdef unicode print_state(State* s, list words):
    # Debug rendering of the state as "<S1> <S0> | <N0> <N1>".
    # The stack holds token indices, so its entries index `words` directly.
    # 'EOL' is appended so the position just past the last word is printable.
    # NOTE(review): empty-stack entries appear to be -1 padding, which would
    # pick up the trailing 'EOL' through negative indexing -- confirm.
    words = list(words) + ['EOL']
    top = words[s.stack[0]]
    second = words[s.stack[-1]]
    n0 = words[s.i]
    # Guard the look-ahead so printing a state at the very end of the input
    # cannot raise IndexError.
    if s.i + 1 < len(words):
        n1 = words[s.i + 1]
    else:
        n1 = 'EOL'
    return ' '.join((second, top, '|', n0, n1))
@ -61,7 +58,7 @@ cdef class GreedyParser:
self.extractor = Extractor(get_templates(self.cfg.features))
self.moves = TransitionSystem(self.cfg.left_labels, self.cfg.right_labels)
self.model = LinearModel(self.moves.n_moves, self.extractor.n_templ)
self.model = LinearModel(self.moves.n_moves, self.extractor.n_templ + 10000)
if os.path.exists(pjoin(model_dir, 'model')):
self.model.load(pjoin(model_dir, 'model'))
@ -94,7 +91,12 @@ cdef class GreedyParser:
cdef Pool mem = Pool()
cdef State* state = init_state(mem, tokens.data, tokens.length)
words = [t.string for t in tokens]
if DEBUG:
print words
print gold_heads
while not is_final(state):
if DEBUG:
print print_state(state, words)
fill_context(context, state)
feats = self.extractor.get_feats(context, &n_feats)
scores = self.model.get_scores(feats, n_feats)
@ -109,5 +111,5 @@ cdef class GreedyParser:
cdef int i
n_corr = 0
for i in range(tokens.length):
n_corr += state.sent[i].head == gold_heads[i]
n_corr += (i + state.sent[i].head) == gold_heads[i]
return n_corr