mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-28 02:46:35 +03:00
* Fix r/l and distance features.
This commit is contained in:
parent
d51a86478e
commit
77d7e79c7e
|
@ -85,7 +85,7 @@ cdef int fill_context(atom_t* context, State* state) except -1:
|
||||||
fill_token(&context[E0w], get_e0(state))
|
fill_token(&context[E0w], get_e0(state))
|
||||||
fill_token(&context[E1w], get_e1(state))
|
fill_token(&context[E1w], get_e1(state))
|
||||||
if state.stack_len >= 1:
|
if state.stack_len >= 1:
|
||||||
context[dist] = min(state.stack[0] - state.i, 5)
|
context[dist] = min(state.i - state.stack[0], 5)
|
||||||
else:
|
else:
|
||||||
context[dist] = 0
|
context[dist] = 0
|
||||||
context[N0lv] = min(count_left_kids(get_n0(state)), 5)
|
context[N0lv] = min(count_left_kids(get_n0(state)), 5)
|
||||||
|
|
|
@ -89,9 +89,9 @@ cdef inline TokenC* get_s2(const State *s) nogil:
|
||||||
# Rely on our padding to ensure we don't go out of bounds here
|
# Rely on our padding to ensure we don't go out of bounds here
|
||||||
return &s.sent[s.stack[-2]]
|
return &s.sent[s.stack[-2]]
|
||||||
|
|
||||||
cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil
|
cdef const TokenC* get_right(const State* s, const TokenC* head, int idx) nogil
|
||||||
|
|
||||||
cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) nogil
|
cdef const TokenC* get_left(const State* s, const TokenC* head, int idx) nogil
|
||||||
|
|
||||||
cdef inline bint at_eol(const State *s) nogil:
|
cdef inline bint at_eol(const State *s) nogil:
|
||||||
return s.i >= s.sent_len
|
return s.i >= s.sent_len
|
||||||
|
|
|
@ -16,10 +16,8 @@ cdef int add_dep(State *s, int head, int child, int label) except -1:
|
||||||
cdef int dist = head - child
|
cdef int dist = head - child
|
||||||
s.sent[child].head = dist
|
s.sent[child].head = dist
|
||||||
s.sent[child].dep = label
|
s.sent[child].dep = label
|
||||||
# Keep a bit-vector tracking child dependencies. If a word has a child at
|
|
||||||
# offset i from it, set that bit (tracking left and right separately)
|
|
||||||
if child > head:
|
if child > head:
|
||||||
s.sent[head].r_kids |= 1 << (-dist)
|
s.sent[head].r_kids += 1
|
||||||
s.sent[head].r_edge = child - head
|
s.sent[head].r_edge = child - head
|
||||||
# Walk up the tree, setting right edge
|
# Walk up the tree, setting right edge
|
||||||
n_iter = 0
|
n_iter = 0
|
||||||
|
@ -34,7 +32,7 @@ cdef int add_dep(State *s, int head, int child, int label) except -1:
|
||||||
msg = msg % (start, child, tree)
|
msg = msg % (start, child, tree)
|
||||||
raise Exception(msg)
|
raise Exception(msg)
|
||||||
else:
|
else:
|
||||||
s.sent[head].l_kids |= 1 << dist
|
s.sent[head].l_kids += 1
|
||||||
s.sent[head].l_edge = (child + s.sent[child].l_edge) - head
|
s.sent[head].l_edge = (child + s.sent[child].l_edge) - head
|
||||||
|
|
||||||
|
|
||||||
|
@ -42,14 +40,14 @@ cdef int del_dep(State *s, int head, int child) except -1:
|
||||||
cdef const TokenC* next_child
|
cdef const TokenC* next_child
|
||||||
cdef int dist = head - child
|
cdef int dist = head - child
|
||||||
if child > head:
|
if child > head:
|
||||||
s.sent[head].r_kids &= ~(1 << (-dist))
|
s.sent[head].r_kids -= 1
|
||||||
next_child = get_right(s, &s.sent[head], 1)
|
next_child = get_right(s, &s.sent[head], 1)
|
||||||
if next_child == NULL:
|
if next_child == NULL:
|
||||||
s.sent[head].r_edge = 0
|
s.sent[head].r_edge = 0
|
||||||
else:
|
else:
|
||||||
s.sent[head].r_edge = next_child.r_edge
|
s.sent[head].r_edge = next_child.r_edge
|
||||||
else:
|
else:
|
||||||
s.sent[head].l_kids &= ~(1 << dist)
|
s.sent[head].l_kids -= 1
|
||||||
next_child = get_left(s, &s.sent[head], 1)
|
next_child = get_left(s, &s.sent[head], 1)
|
||||||
if next_child == NULL:
|
if next_child == NULL:
|
||||||
s.sent[head].l_edge = 0
|
s.sent[head].l_edge = 0
|
||||||
|
@ -113,36 +111,60 @@ cdef bint has_head(const TokenC* t) nogil:
|
||||||
return t.head != 0
|
return t.head != 0
|
||||||
|
|
||||||
|
|
||||||
cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) nogil:
|
cdef const TokenC* get_left(const State* s, const TokenC* target, int idx) nogil:
|
||||||
cdef uint32_t kids = head.l_kids
|
if target.l_kids == 0:
|
||||||
if kids == 0:
|
|
||||||
return NULL
|
return NULL
|
||||||
cdef int offset = _nth_significant_bit(kids, idx)
|
if idx > target.l_kids:
|
||||||
cdef const TokenC* child = head - offset
|
return NULL
|
||||||
if child >= s.sent:
|
if idx < 1:
|
||||||
return child
|
return NULL
|
||||||
|
cdef const TokenC* ptr = s.sent
|
||||||
|
while ptr < target:
|
||||||
|
# If this token's head is still to the left of the target, we can skip to it
|
||||||
|
# No token that's between this token and this head could be our
|
||||||
|
# child.
|
||||||
|
if (ptr.head >= 1) and (ptr + ptr.head) < target:
|
||||||
|
ptr += ptr.head
|
||||||
|
elif ptr + ptr.head == target:
|
||||||
|
idx -= 1
|
||||||
|
if idx == 0:
|
||||||
|
return ptr
|
||||||
|
ptr += 1
|
||||||
else:
|
else:
|
||||||
|
ptr += 1
|
||||||
return NULL
|
return NULL
|
||||||
|
|
||||||
|
|
||||||
cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil:
|
cdef const TokenC* get_right(const State* s, const TokenC* target, int idx) nogil:
|
||||||
cdef uint32_t kids = head.r_kids
|
if target.r_kids == 0:
|
||||||
if kids == 0:
|
|
||||||
return NULL
|
return NULL
|
||||||
cdef int offset = _nth_significant_bit(kids, idx)
|
if idx > target.r_kids:
|
||||||
cdef const TokenC* child = head + offset
|
return NULL
|
||||||
if child < (s.sent + s.sent_len):
|
if idx < 1:
|
||||||
return child
|
return NULL
|
||||||
|
cdef const TokenC* ptr = s.sent + (s.sent_len - 1)
|
||||||
|
while ptr > target:
|
||||||
|
# If this token's head is still to the right of the target, we can skip to it
|
||||||
|
# No token that's between this token and this head could be our
|
||||||
|
# child.
|
||||||
|
if (ptr.head < 0) and ((ptr + ptr.head) > target):
|
||||||
|
ptr += ptr.head
|
||||||
|
elif ptr + ptr.head == target:
|
||||||
|
idx -= 1
|
||||||
|
if idx == 0:
|
||||||
|
return ptr
|
||||||
|
ptr -= 1
|
||||||
else:
|
else:
|
||||||
|
ptr -= 1
|
||||||
return NULL
|
return NULL
|
||||||
|
|
||||||
|
|
||||||
cdef int count_left_kids(const TokenC* head) nogil:
|
cdef int count_left_kids(const TokenC* head) nogil:
|
||||||
return _popcount(head.l_kids)
|
return head.l_kids
|
||||||
|
|
||||||
|
|
||||||
cdef int count_right_kids(const TokenC* head) nogil:
|
cdef int count_right_kids(const TokenC* head) nogil:
|
||||||
return _popcount(head.r_kids)
|
return head.r_kids
|
||||||
|
|
||||||
|
|
||||||
cdef State* new_state(Pool mem, const TokenC* sent, const int sent_len) except NULL:
|
cdef State* new_state(Pool mem, const TokenC* sent, const int sent_len) except NULL:
|
||||||
|
@ -185,23 +207,3 @@ cdef int copy_state(State* dest, const State* src) except -1:
|
||||||
for i in range(src.ents_len):
|
for i in range(src.ents_len):
|
||||||
dest.ent[-i] = src.ent[-i]
|
dest.ent[-i] = src.ent[-i]
|
||||||
dest.ents_len = src.ents_len
|
dest.ents_len = src.ents_len
|
||||||
|
|
||||||
|
|
||||||
# From https://en.wikipedia.org/wiki/Hamming_weight
|
|
||||||
cdef inline uint32_t _popcount(uint32_t x) nogil:
|
|
||||||
"""Find number of non-zero bits."""
|
|
||||||
cdef int count = 0
|
|
||||||
while x != 0:
|
|
||||||
x &= x - 1
|
|
||||||
count += 1
|
|
||||||
return count
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
|
|
||||||
cdef int i
|
|
||||||
for i in range(32):
|
|
||||||
if bits & (1 << i):
|
|
||||||
n -= 1
|
|
||||||
if n < 1:
|
|
||||||
return i
|
|
||||||
return 0
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user