* Work on constituency parsing.

This commit is contained in:
Matthew Honnibal 2015-05-20 16:02:51 +02:00
parent 7c8bf0eba5
commit 9dfc9c039c
2 changed files with 54 additions and 41 deletions

View File

@ -5,16 +5,15 @@ from cymem.cymem cimport Pool
from ..structs cimport TokenC, Entity, Constituent from ..structs cimport TokenC, Entity, Constituent
cdef struct State: cdef struct State:
TokenC* sent TokenC* sent
int* stack int* stack
Entity* ent Entity* ent
Constituent* ctnt
int i int i
int sent_len int sent_len
int stack_len int stack_len
int ents_len int ents_len
int ctnt_len
cdef int add_dep(const State *s, const int head, const int child, const int label) except -1 cdef int add_dep(const State *s, const int head, const int child, const int label) except -1

View File

@ -183,32 +183,37 @@ cdef int _do_break(const Transition* self, State* state) except -1:
cdef int _do_constituent(const Transition* self, State* state) except -1: cdef int _do_constituent(const Transition* self, State* state) except -1:
cdef const TokenC* s0 = get_s0(state) cdef Constituent* bracket = new_bracket(state.ctnts)
if state.ctnt.head == get_idx(state, s0):
start = state.ctnt.start bracket.parent = NULL
else: bracket.label = self.label
start = get_idx(state, s0) bracket.head = get_s0(state)
state.ctnt += 1 bracket.length = 0
state.ctnt.start = start
state.ctnt.end = s0.r_edge attach(bracket, state.ctnts.stack)
state.ctnt.head = get_idx(state, s0) # Attach rightward children. They're in the brackets array somewhere
state.ctnt.label = self.label # between here and B0.
cdef Constituent* node
cdef const TokenC* node_gov
for i in range(1, bracket - state.ctnts.stack):
node = bracket - i
node_gov = node.head + node.head.head
if node_gov == bracket.head:
attach(bracket, node)
cdef int _do_adjust(const Transition* self, State* state) except -1: cdef int _do_adjust(const Transition* self, State* state) except -1:
cdef const TokenC* child cdef Constituent* b0 = state.ctnts.stack[0]
cdef const TokenC* s0 = get_s0(state) cdef Constituent* b1 = state.ctnts.stack[1]
cdef int n_left = count_left_kids(s0)
for i in range(1, n_left): assert (b1.head + b1.head.head) == b0.head
child = get_left(state, s0, i) assert b0.head < b1.head
assert child is not NULL assert b0 < b1
if child.l_edge < state.ctnt.start:
state.ctnt.start = child.l_edge attach(b0, b1)
break # Pop B1 from stack, but keep B0 on top
else: state.ctnts.stack -= 1
msg = ("Error moving bracket --- Move should be invalid if " state.ctnts.stack[0] = b0
"no left edge to move to.")
raise Exception(msg)
do_funcs[SHIFT] = _do_shift do_funcs[SHIFT] = _do_shift
@ -374,14 +379,14 @@ cdef inline bint _can_right(const State* s) nogil:
cdef inline bint _can_left(const State* s) nogil: cdef inline bint _can_left(const State* s) nogil:
if NON_MONOTONIC: if NON_MONOTONIC:
return s.stack_len >= 1 return s.stack_len >= 1 and not missing_brackets(s)
else: else:
return s.stack_len >= 1 and not has_head(get_s0(s)) return s.stack_len >= 1 and not has_head(get_s0(s))
cdef inline bint _can_reduce(const State* s) nogil: cdef inline bint _can_reduce(const State* s) nogil:
if NON_MONOTONIC: if NON_MONOTONIC:
return s.stack_len >= 2 return s.stack_len >= 2 and not missing_brackets(s)
else: else:
return s.stack_len >= 2 and has_head(get_s0(s)) return s.stack_len >= 2 and has_head(get_s0(s))
@ -401,24 +406,33 @@ cdef inline bint _can_break(const State* s) nogil:
return False return False
else: else:
seen_headless = True seen_headless = True
# TODO: Constituency constraints
return True return True
cdef inline bint _can_constituent(const State* s) nogil: cdef inline bint _can_constituent(const State* s) nogil:
if s.stack_len < 1:
return False
else:
# If all stack elements are popped, can't constituent
for i in range(s.ctnts.stack_len):
if not s.ctnts.is_popped[-i]:
return True
else:
return False return False
#return s.stack_len >= 1
cdef inline bint _can_adjust(const State* s) nogil: cdef inline bint _can_adjust(const State* s) nogil:
if s.ctnts.stack_len < 2:
return False return False
# Need a left child to move the bracket to
#cdef const TokenC* child cdef const Constituent* b1 = s.ctnts.stack[-1]
#cdef const TokenC* s0 = get_s0(s) cdef const Constituent* b0 = s.ctnts.stack[0]
#cdef int n_left = count_left_kids(s0)
#cdef int i if (b1.head + b1.head.head) != b0.head:
#for i in range(1, n_left): return False
# child = get_left(s, s0, i) elif b0.head >= b1.head:
# if child.l_edge < s.ctnt.start: return False
# return True elif b0 >= b1:
#else: return False
# return False return True