mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
* Work on greedy parser
This commit is contained in:
parent
95ccea03b2
commit
d524dd306a
File diff suppressed because it is too large
Load Diff
|
@ -17,6 +17,14 @@ from ._state cimport get_left, get_right
|
|||
|
||||
|
||||
cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
|
||||
if token is NULL:
|
||||
context[0] = 0
|
||||
context[1] = 0
|
||||
context[2] = 0
|
||||
context[3] = 0
|
||||
context[4] = 0
|
||||
context[5] = 0
|
||||
else:
|
||||
context[0] = token.lex.sic
|
||||
context[1] = token.pos
|
||||
context[2] = token.lex.cluster
|
||||
|
@ -40,30 +48,29 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
|
|||
cdef int fill_context(atom_t* context, State* state) except -1:
|
||||
# This fills in the basic properties of each of our "slot" tokens, e.g.
|
||||
# word on top of the stack, word at the front of the buffer, etc.
|
||||
cdef TokenC* n1 = get_n1(state)
|
||||
fill_token(&context[S2w], get_s2(state))
|
||||
fill_token(&context[S1w], get_s1(state))
|
||||
#fill_token(&context[S1rw], get_right(state, get_s1(state), 0))
|
||||
fill_token(&context[S0lw], get_left(state, get_s0(state), 0))
|
||||
fill_token(&context[S0l2w], get_left(state, get_s0(state), 1))
|
||||
fill_token(&context[S1rw], get_right(state, get_s1(state), 1))
|
||||
fill_token(&context[S0lw], get_left(state, get_s0(state), 1))
|
||||
fill_token(&context[S0l2w], get_left(state, get_s0(state), 2))
|
||||
fill_token(&context[S0w], get_s0(state))
|
||||
#fill_token(&context[S0r2w], get_right(state, get_s0(state), 1))
|
||||
fill_token(&context[S0rw], get_right(state, get_s0(state), 0))
|
||||
#fill_token(&context[N0lw], get_left(state, get_n0(state), 0))
|
||||
#fill_token(&context[N0l2w], get_left(state, get_n0(state), 1))
|
||||
fill_token(&context[S0r2w], get_right(state, get_s0(state), 2))
|
||||
fill_token(&context[S0rw], get_right(state, get_s0(state), 1))
|
||||
fill_token(&context[N0lw], get_left(state, get_n0(state), 0))
|
||||
fill_token(&context[N0l2w], get_left(state, get_n0(state), 1))
|
||||
fill_token(&context[N0w], get_n0(state))
|
||||
#fill_token(&context[N1w], get_n1(state))
|
||||
#fill_token(&context[N2w], get_n2(state))
|
||||
fill_token(&context[N1w], get_n1(state))
|
||||
fill_token(&context[N2w], get_n2(state))
|
||||
|
||||
#if state.stack_len >= 1:
|
||||
# context[dist] = state.stack[0] - state.sent
|
||||
#else:
|
||||
# context[dist] = 0
|
||||
#context[N0lv] = 0
|
||||
#context[S0lv] = 0
|
||||
#context[S0rv] = 0
|
||||
#context[S1lv] = 0
|
||||
#context[S1rv] = 0
|
||||
if state.stack_len >= 1:
|
||||
context[dist] = state.stack[0] - state.i
|
||||
else:
|
||||
context[dist] = 0
|
||||
context[N0lv] = 0
|
||||
context[S0lv] = 0
|
||||
context[S0rv] = 0
|
||||
context[S1lv] = 0
|
||||
context[S1rv] = 0
|
||||
|
||||
|
||||
arc_eager = (
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -7,15 +7,16 @@ from ..tokens cimport TokenC
|
|||
|
||||
cdef struct State:
|
||||
TokenC* sent
|
||||
int* stack
|
||||
int i
|
||||
int sent_len
|
||||
int stack_len
|
||||
|
||||
|
||||
cdef int add_dep(State *s, TokenC* head, TokenC* child, int label) except -1
|
||||
cdef int add_dep(const State *s, const int head, const int child, const int label) except -1
|
||||
|
||||
|
||||
cdef TokenC* pop_stack(State *s) except NULL
|
||||
cdef int pop_stack(State *s) except -1
|
||||
cdef int push_stack(State *s) except -1
|
||||
|
||||
|
||||
|
@ -32,33 +33,35 @@ cdef inline TokenC* get_n0(const State* s) nogil:
|
|||
|
||||
|
||||
cdef inline TokenC* get_n1(const State* s) nogil:
|
||||
if s.i < (s.sent_len - 1):
|
||||
return &s.sent[s.i+1]
|
||||
if (s.i+1) >= s.sent_len:
|
||||
return NULL
|
||||
else:
|
||||
return s.sent - 1
|
||||
return &s.sent[s.i+1]
|
||||
|
||||
|
||||
cdef inline TokenC* get_n2(const State* s) nogil:
|
||||
if (s.i + 2) >= s.sent_len:
|
||||
return NULL
|
||||
else:
|
||||
return &s.sent[s.i+2]
|
||||
|
||||
|
||||
cdef inline TokenC* get_s0(const State *s) nogil:
|
||||
return s.stack[0]
|
||||
return &s.sent[s.stack[0]]
|
||||
|
||||
|
||||
cdef inline TokenC* get_s1(const State *s) nogil:
|
||||
# Rely on our padding to ensure we don't go out of bounds here
|
||||
cdef TokenC** s1 = s.stack - 1
|
||||
return s1[0]
|
||||
return &s.sent[s.stack[-1]]
|
||||
|
||||
|
||||
cdef inline TokenC* get_s2(const State *s) nogil:
|
||||
# Rely on our padding to ensure we don't go out of bounds here
|
||||
cdef TokenC** s2 = s.stack - 2
|
||||
return s2[0]
|
||||
return &s.sent[s.stack[-2]]
|
||||
|
||||
cdef TokenC* get_right(State* s, TokenC* head, int idx) nogil
|
||||
cdef TokenC* get_left(State* s, TokenC* head, int idx) nogil
|
||||
cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil
|
||||
|
||||
cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) nogil
|
||||
|
||||
cdef inline bint at_eol(const State *s) nogil:
|
||||
return s.i >= s.sent_len
|
||||
|
@ -68,10 +71,10 @@ cdef inline bint is_final(const State *s) nogil:
|
|||
return at_eol(s) # The stack will be attached to root anyway
|
||||
|
||||
|
||||
cdef int children_in_buffer(const State *s, const TokenC* target, list gold) except -1
|
||||
cdef int head_in_buffer(const State *s, const TokenC* target, list gold) except -1
|
||||
cdef int children_in_stack(const State *s, const TokenC* target, list gold) except -1
|
||||
cdef int head_in_stack(const State *s, const TokenC*, list gold) except -1
|
||||
cdef int children_in_buffer(const State *s, const int head, list gold) except -1
|
||||
cdef int head_in_buffer(const State *s, const int child, list gold) except -1
|
||||
cdef int children_in_stack(const State *s, const int head, list gold) except -1
|
||||
cdef int head_in_stack(const State *s, const int child, list gold) except -1
|
||||
|
||||
cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL
|
||||
|
||||
|
@ -81,5 +84,7 @@ cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
|
|||
cdef int i
|
||||
for i in range(32):
|
||||
if bits & (1 << i):
|
||||
n -= 1
|
||||
if n < 1:
|
||||
return i
|
||||
return 0
|
||||
|
|
|
@ -5,104 +5,82 @@ from cymem.cymem cimport Pool
|
|||
from ..lexeme cimport EMPTY_LEXEME
|
||||
|
||||
|
||||
cdef int add_dep(State *s, TokenC* head, TokenC* child, int label) except -1:
|
||||
child.head = head - child
|
||||
child.dep_tag = label
|
||||
cdef int add_dep(State *s, int head, int child, int label) except -1:
|
||||
s.sent[child].head = head - child
|
||||
s.sent[child].dep_tag = label
|
||||
# Keep a bit-vector tracking child dependencies. If a word has a child at
|
||||
# offset i from it, set that bit (tracking left and right separately)
|
||||
if child > head:
|
||||
head.r_kids |= 1 << child.head
|
||||
s.sent[head].r_kids |= 1 << (-s.sent[child].head)
|
||||
else:
|
||||
head.l_kids |= 1 << (-child.head)
|
||||
s.sent[head].l_kids |= 1 << s.sent[child].head
|
||||
|
||||
|
||||
cdef TokenC* pop_stack(State *s) except NULL:
|
||||
cdef int pop_stack(State *s) except -1:
|
||||
assert s.stack_len >= 1
|
||||
cdef TokenC* top = s.stack[0]
|
||||
s.stack -= 1
|
||||
s.stack_len -= 1
|
||||
return top
|
||||
s.stack -= 1
|
||||
|
||||
|
||||
cdef int push_stack(State *s) except -1:
|
||||
assert s.i < s.sent_len
|
||||
s.stack += 1
|
||||
s.stack[0] = &s.sent[s.i]
|
||||
s.stack[0] = s.i
|
||||
s.stack_len += 1
|
||||
s.i += 1
|
||||
|
||||
|
||||
cdef int children_in_buffer(const State *s, const TokenC* target, list gold) except -1:
|
||||
cdef int children_in_buffer(const State *s, int head, list gold) except -1:
|
||||
# Golds holds an array of head offsets --- the head of word i is i - golds[i]
|
||||
# Iterate over the tokens of the queue, and check whether their gold head is
|
||||
# our target
|
||||
cdef int i
|
||||
cdef int n = 0
|
||||
cdef TokenC* buff_word
|
||||
cdef TokenC* buff_head
|
||||
cdef int buff_word_head_offset
|
||||
for i in range(s.i, s.sent_len):
|
||||
buff_word = &s.sent[i]
|
||||
buff_word_head_offset = gold[i]
|
||||
buff_head = buff_word + buff_word_head_offset
|
||||
if buff_head == target:
|
||||
if gold[i] == head:
|
||||
n += 1
|
||||
return n
|
||||
|
||||
|
||||
cdef int head_in_buffer(const State *s, const TokenC* target, list gold) except -1:
|
||||
cdef int target_idx = get_idx(s, target)
|
||||
cdef int target_head_idx = target_idx + gold[target_idx]
|
||||
return target_head_idx >= s.i
|
||||
cdef int head_in_buffer(const State *s, const int child, list gold) except -1:
|
||||
return gold[child] >= s.i
|
||||
|
||||
|
||||
cdef int children_in_stack(const State *s, const TokenC* target, list gold) except -1:
|
||||
if s.stack_len == 0:
|
||||
return 0
|
||||
cdef int children_in_stack(const State *s, const int head, list gold) except -1:
|
||||
cdef int i
|
||||
cdef int n = 0
|
||||
cdef const TokenC* stack_word
|
||||
cdef const TokenC* stack_word_head
|
||||
cdef int stack_word_head_offset
|
||||
for i in range(s.stack_len):
|
||||
stack_word = (s.stack - i)[0]
|
||||
stack_word_head_offset = gold[get_idx(s, stack_word)]
|
||||
stack_word_head = (s.stack + stack_word_head_offset)[0]
|
||||
if stack_word_head == target:
|
||||
if gold[s.stack[-i]] == head:
|
||||
n += 1
|
||||
return n
|
||||
|
||||
|
||||
cdef int head_in_stack(const State *s, const TokenC* target, list gold) except -1:
|
||||
if s.stack_len == 0:
|
||||
return 0
|
||||
cdef int head_offset = gold[get_idx(s, target)]
|
||||
cdef const TokenC* target_head = target + head_offset
|
||||
cdef int head_in_stack(const State *s, const int child, list gold) except -1:
|
||||
cdef int i
|
||||
for i in range(s.stack_len):
|
||||
if target_head == (s.stack - i)[0]:
|
||||
if gold[child] == s.stack[-i]:
|
||||
return 1
|
||||
return 0
|
||||
|
||||
|
||||
cdef TokenC* get_left(State* s, TokenC* head, int idx) nogil:
|
||||
cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) nogil:
|
||||
cdef uint32_t kids = head.l_kids
|
||||
if kids == 0:
|
||||
return s.sent - 1
|
||||
return NULL
|
||||
cdef int offset = _nth_significant_bit(kids, idx)
|
||||
cdef TokenC* child = head - offset
|
||||
cdef const TokenC* child = head - offset
|
||||
if child >= s.sent:
|
||||
return child
|
||||
else:
|
||||
return s.sent - 1
|
||||
|
||||
|
||||
cdef TokenC* get_right(State* s, TokenC* head, int idx) nogil:
|
||||
cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil:
|
||||
cdef uint32_t kids = head.r_kids
|
||||
if kids == 0:
|
||||
return s.sent - 1
|
||||
return NULL
|
||||
cdef int offset = _nth_significant_bit(kids, idx)
|
||||
cdef TokenC* child = head + offset
|
||||
cdef const TokenC* child = head + offset
|
||||
if child < (s.sent + s.sent_len):
|
||||
return child
|
||||
else:
|
||||
|
@ -115,13 +93,11 @@ DEF PADDING = 5
|
|||
cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL:
|
||||
cdef int padded_len = sent_length + PADDING + PADDING
|
||||
cdef State* s = <State*>mem.alloc(1, sizeof(State))
|
||||
s.stack = <TokenC**>mem.alloc(padded_len, sizeof(TokenC*))
|
||||
cdef TokenC* eol_token = sent - 1
|
||||
s.stack = <int*>mem.alloc(padded_len, sizeof(int))
|
||||
for i in range(PADDING):
|
||||
# sent should be padded, with a suitable sentinel token here
|
||||
s.stack[0] = eol_token
|
||||
s.stack += 1
|
||||
s.stack[0] = eol_token
|
||||
s.stack[i] = -1
|
||||
s.stack += (PADDING - 1)
|
||||
assert s.stack[0] == -1
|
||||
s.sent = sent
|
||||
s.stack_len = 0
|
||||
s.i = 0
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -35,43 +35,35 @@ cdef inline bint _can_reduce(const State* s) nogil:
|
|||
cdef int _shift_cost(const State* s, list gold) except -1:
|
||||
assert not at_eol(s)
|
||||
cost = 0
|
||||
cost += head_in_stack(s, get_n0(s), gold)
|
||||
cost += children_in_stack(s, get_n0(s), gold)
|
||||
cost += head_in_stack(s, s.i, gold)
|
||||
cost += children_in_stack(s, s.i, gold)
|
||||
return cost
|
||||
|
||||
|
||||
cdef int _right_cost(const State* s, list gold) except -1:
|
||||
assert s.stack_len >= 1
|
||||
cdef int s0_idx = get_idx(s, get_s0(s))
|
||||
cost = 0
|
||||
if _gold_dep(s, get_s0(s), get_n0(s), gold):
|
||||
if gold[s.i] == s.stack[0]:
|
||||
return cost
|
||||
cost += head_in_buffer(s, get_n0(s), gold)
|
||||
cost += children_in_stack(s, get_n0(s), gold)
|
||||
cost += head_in_stack(s, get_n0(s), gold)
|
||||
cost += head_in_buffer(s, s.i, gold)
|
||||
cost += children_in_stack(s, s.i, gold)
|
||||
cost += head_in_stack(s, s.i, gold)
|
||||
return cost
|
||||
|
||||
|
||||
cdef int _left_cost(const State* s, list gold) except -1:
|
||||
assert s.stack_len >= 1
|
||||
cost = 0
|
||||
if _gold_dep(s, get_n0(s), get_s0(s), gold):
|
||||
if gold[s.stack[0]] == s.i:
|
||||
return cost
|
||||
|
||||
cost += head_in_buffer(s, get_s0(s), gold)
|
||||
cost += children_in_buffer(s, get_s0(s), gold)
|
||||
cost += head_in_buffer(s, s.stack[0], gold)
|
||||
cost += children_in_buffer(s, s.stack[0], gold)
|
||||
return cost
|
||||
|
||||
|
||||
cdef int _reduce_cost(const State* s, list gold) except -1:
|
||||
return children_in_buffer(s, get_s0(s), gold)
|
||||
|
||||
|
||||
cdef int _gold_dep(const State* s, const TokenC* head, const TokenC* child,
|
||||
list gold_offsets) except -1:
|
||||
cdef int head_idx = get_idx(s, head)
|
||||
cdef int child_idx = get_idx(s, child)
|
||||
return child_idx + gold_offsets[child_idx] == head_idx
|
||||
return children_in_buffer(s, s.stack[0], gold)
|
||||
|
||||
|
||||
cdef class TransitionSystem:
|
||||
|
@ -109,10 +101,10 @@ cdef class TransitionSystem:
|
|||
if t.move == SHIFT:
|
||||
push_stack(s)
|
||||
elif t.move == LEFT:
|
||||
add_dep(s, get_n0(s), get_s0(s), t.label)
|
||||
add_dep(s, s.i, s.stack[0], t.label)
|
||||
pop_stack(s)
|
||||
elif t.move == RIGHT:
|
||||
add_dep(s, get_s0(s), get_n0(s), t.label)
|
||||
add_dep(s, s.stack[0], s.i, t.label)
|
||||
push_stack(s)
|
||||
elif t.move == REDUCE:
|
||||
pop_stack(s)
|
||||
|
@ -157,12 +149,12 @@ cdef class TransitionSystem:
|
|||
if move == SHIFT or move == REDUCE:
|
||||
cost = 0
|
||||
elif move == LEFT:
|
||||
if _gold_dep(s, get_n0(s), get_s0(s), gold_heads):
|
||||
cost = label != gold_labels[get_idx(s, get_s0(s))]
|
||||
if gold_heads[s.stack[0]] == s.i:
|
||||
cost = label != gold_labels[s.stack[0]]
|
||||
else:
|
||||
cost = 0
|
||||
elif move == RIGHT:
|
||||
if _gold_dep(s, get_s0(s), get_n0(s), gold_heads):
|
||||
if gold_heads[s.i] == s.stack[0]:
|
||||
cost = label != gold_labels[s.i]
|
||||
else:
|
||||
cost = 0
|
||||
|
@ -173,24 +165,14 @@ cdef class TransitionSystem:
|
|||
score = scores[i]
|
||||
|
||||
if best < 0:
|
||||
for i in range(self.n_moves):
|
||||
if self._moves[i].move == LEFT:
|
||||
print self._moves[i].label,
|
||||
print
|
||||
print _gold_dep(s, get_n0(s), get_s0(s), gold_heads)
|
||||
print gold_labels[get_idx(s, get_s0(s))]
|
||||
print unl_costs[LEFT]
|
||||
print "S0:"
|
||||
print "Head:", gold_heads[get_idx(s, get_s0(s))]
|
||||
print "h. in b.", head_in_buffer(s, get_s0(s), gold_heads)
|
||||
print "c. in b.", children_in_buffer(s, get_s0(s), gold_heads)
|
||||
print "h. in s.", head_in_stack(s, get_s0(s), gold_heads)
|
||||
print "c. in s.", children_in_stack(s, get_s0(s), gold_heads)
|
||||
print "N0:"
|
||||
print "Head:", gold_heads[get_idx(s, get_n0(s))]
|
||||
print "h. in b.", head_in_buffer(s, get_n0(s), gold_heads)
|
||||
print "c. in b.", children_in_buffer(s, get_n0(s), gold_heads)
|
||||
print "h. in s.", head_in_stack(s, get_n0(s), gold_heads)
|
||||
print "c. in s.", children_in_stack(s, get_n0(s), gold_heads)
|
||||
print unl_costs[SHIFT], unl_costs[REDUCE], unl_costs[LEFT], unl_costs[RIGHT]
|
||||
print s.stack_len
|
||||
print has_head(get_s0(s))
|
||||
print s.sent[s.stack[0]].head
|
||||
print s.stack[0], s.i
|
||||
print gold_heads[s.stack[0]], gold_heads[s.i]
|
||||
print gold_labels[s.i]
|
||||
print children_in_buffer(s, s.stack[0], gold_heads)
|
||||
print head_in_buffer(s, s.stack[0], gold_heads)
|
||||
raise StandardError
|
||||
return best
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -32,9 +32,6 @@ from . import _parse_features
|
|||
from ._parse_features cimport fill_context, CONTEXT_SIZE
|
||||
|
||||
|
||||
DEF CONTEXT_SIZE = 50
|
||||
|
||||
|
||||
DEBUG = False
|
||||
def set_debug(val):
|
||||
global DEBUG
|
||||
|
@ -43,8 +40,8 @@ def set_debug(val):
|
|||
|
||||
cdef unicode print_state(State* s, list words):
|
||||
words = list(words) + ['EOL']
|
||||
top = words[get_idx(s, get_s0(s))]
|
||||
second = words[get_idx(s, get_s1(s))]
|
||||
top = words[s.stack[0]]
|
||||
second = words[s.stack[-1]]
|
||||
n0 = words[s.i]
|
||||
n1 = words[s.i + 1]
|
||||
return ' '.join((second, top, '|', n0, n1))
|
||||
|
@ -61,7 +58,7 @@ cdef class GreedyParser:
|
|||
self.extractor = Extractor(get_templates(self.cfg.features))
|
||||
self.moves = TransitionSystem(self.cfg.left_labels, self.cfg.right_labels)
|
||||
|
||||
self.model = LinearModel(self.moves.n_moves, self.extractor.n_templ)
|
||||
self.model = LinearModel(self.moves.n_moves, self.extractor.n_templ + 10000)
|
||||
if os.path.exists(pjoin(model_dir, 'model')):
|
||||
self.model.load(pjoin(model_dir, 'model'))
|
||||
|
||||
|
@ -94,7 +91,12 @@ cdef class GreedyParser:
|
|||
cdef Pool mem = Pool()
|
||||
cdef State* state = init_state(mem, tokens.data, tokens.length)
|
||||
words = [t.string for t in tokens]
|
||||
if DEBUG:
|
||||
print words
|
||||
print gold_heads
|
||||
while not is_final(state):
|
||||
if DEBUG:
|
||||
print print_state(state, words)
|
||||
fill_context(context, state)
|
||||
feats = self.extractor.get_feats(context, &n_feats)
|
||||
scores = self.model.get_scores(feats, n_feats)
|
||||
|
@ -109,5 +111,5 @@ cdef class GreedyParser:
|
|||
cdef int i
|
||||
n_corr = 0
|
||||
for i in range(tokens.length):
|
||||
n_corr += state.sent[i].head == gold_heads[i]
|
||||
n_corr += (i + state.sent[i].head) == gold_heads[i]
|
||||
return n_corr
|
||||
|
|
Loading…
Reference in New Issue
Block a user