mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 02:16:32 +03:00
* Fix r/l and distance features.
This commit is contained in:
parent
d51a86478e
commit
77d7e79c7e
|
@ -85,7 +85,7 @@ cdef int fill_context(atom_t* context, State* state) except -1:
|
|||
fill_token(&context[E0w], get_e0(state))
|
||||
fill_token(&context[E1w], get_e1(state))
|
||||
if state.stack_len >= 1:
|
||||
context[dist] = min(state.stack[0] - state.i, 5)
|
||||
context[dist] = min(state.i - state.stack[0], 5)
|
||||
else:
|
||||
context[dist] = 0
|
||||
context[N0lv] = min(count_left_kids(get_n0(state)), 5)
|
||||
|
|
|
@ -89,9 +89,9 @@ cdef inline TokenC* get_s2(const State *s) nogil:
|
|||
# Rely on our padding to ensure we don't go out of bounds here
|
||||
return &s.sent[s.stack[-2]]
|
||||
|
||||
cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil
|
||||
cdef const TokenC* get_right(const State* s, const TokenC* head, int idx) nogil
|
||||
|
||||
cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) nogil
|
||||
cdef const TokenC* get_left(const State* s, const TokenC* head, int idx) nogil
|
||||
|
||||
cdef inline bint at_eol(const State *s) nogil:
|
||||
return s.i >= s.sent_len
|
||||
|
|
|
@ -16,10 +16,8 @@ cdef int add_dep(State *s, int head, int child, int label) except -1:
|
|||
cdef int dist = head - child
|
||||
s.sent[child].head = dist
|
||||
s.sent[child].dep = label
|
||||
# Keep a bit-vector tracking child dependencies. If a word has a child at
|
||||
# offset i from it, set that bit (tracking left and right separately)
|
||||
if child > head:
|
||||
s.sent[head].r_kids |= 1 << (-dist)
|
||||
s.sent[head].r_kids += 1
|
||||
s.sent[head].r_edge = child - head
|
||||
# Walk up the tree, setting right edge
|
||||
n_iter = 0
|
||||
|
@ -34,7 +32,7 @@ cdef int add_dep(State *s, int head, int child, int label) except -1:
|
|||
msg = msg % (start, child, tree)
|
||||
raise Exception(msg)
|
||||
else:
|
||||
s.sent[head].l_kids |= 1 << dist
|
||||
s.sent[head].l_kids += 1
|
||||
s.sent[head].l_edge = (child + s.sent[child].l_edge) - head
|
||||
|
||||
|
||||
|
@ -42,14 +40,14 @@ cdef int del_dep(State *s, int head, int child) except -1:
|
|||
cdef const TokenC* next_child
|
||||
cdef int dist = head - child
|
||||
if child > head:
|
||||
s.sent[head].r_kids &= ~(1 << (-dist))
|
||||
s.sent[head].r_kids -= 1
|
||||
next_child = get_right(s, &s.sent[head], 1)
|
||||
if next_child == NULL:
|
||||
s.sent[head].r_edge = 0
|
||||
else:
|
||||
s.sent[head].r_edge = next_child.r_edge
|
||||
else:
|
||||
s.sent[head].l_kids &= ~(1 << dist)
|
||||
s.sent[head].l_kids -= 1
|
||||
next_child = get_left(s, &s.sent[head], 1)
|
||||
if next_child == NULL:
|
||||
s.sent[head].l_edge = 0
|
||||
|
@ -113,36 +111,60 @@ cdef bint has_head(const TokenC* t) nogil:
|
|||
return t.head != 0
|
||||
|
||||
|
||||
cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) nogil:
|
||||
cdef uint32_t kids = head.l_kids
|
||||
if kids == 0:
|
||||
cdef const TokenC* get_left(const State* s, const TokenC* target, int idx) nogil:
|
||||
if target.l_kids == 0:
|
||||
return NULL
|
||||
cdef int offset = _nth_significant_bit(kids, idx)
|
||||
cdef const TokenC* child = head - offset
|
||||
if child >= s.sent:
|
||||
return child
|
||||
if idx > target.l_kids:
|
||||
return NULL
|
||||
if idx < 1:
|
||||
return NULL
|
||||
cdef const TokenC* ptr = s.sent
|
||||
while ptr < target:
|
||||
# If this head is still to the right of us, we can skip to it
|
||||
# No token that's between this token and this head could be our
|
||||
# child.
|
||||
if (ptr.head >= 1) and (ptr + ptr.head) < target:
|
||||
ptr += ptr.head
|
||||
elif ptr + ptr.head == target:
|
||||
idx -= 1
|
||||
if idx == 0:
|
||||
return ptr
|
||||
ptr += 1
|
||||
else:
|
||||
ptr += 1
|
||||
return NULL
|
||||
|
||||
|
||||
cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil:
|
||||
cdef uint32_t kids = head.r_kids
|
||||
if kids == 0:
|
||||
cdef const TokenC* get_right(const State* s, const TokenC* target, int idx) nogil:
|
||||
if target.r_kids == 0:
|
||||
return NULL
|
||||
cdef int offset = _nth_significant_bit(kids, idx)
|
||||
cdef const TokenC* child = head + offset
|
||||
if child < (s.sent + s.sent_len):
|
||||
return child
|
||||
if idx > target.r_kids:
|
||||
return NULL
|
||||
if idx < 1:
|
||||
return NULL
|
||||
cdef const TokenC* ptr = s.sent + (s.sent_len - 1)
|
||||
while ptr > target:
|
||||
# If this head is still to the right of us, we can skip to it
|
||||
# No token that's between this token and this head could be our
|
||||
# child.
|
||||
if (ptr.head < 0) and ((ptr + ptr.head) > target):
|
||||
ptr += ptr.head
|
||||
elif ptr + ptr.head == target:
|
||||
idx -= 1
|
||||
if idx == 0:
|
||||
return ptr
|
||||
ptr -= 1
|
||||
else:
|
||||
ptr -= 1
|
||||
return NULL
|
||||
|
||||
|
||||
cdef int count_left_kids(const TokenC* head) nogil:
|
||||
return _popcount(head.l_kids)
|
||||
return head.l_kids
|
||||
|
||||
|
||||
cdef int count_right_kids(const TokenC* head) nogil:
|
||||
return _popcount(head.r_kids)
|
||||
return head.r_kids
|
||||
|
||||
|
||||
cdef State* new_state(Pool mem, const TokenC* sent, const int sent_len) except NULL:
|
||||
|
@ -185,23 +207,3 @@ cdef int copy_state(State* dest, const State* src) except -1:
|
|||
for i in range(src.ents_len):
|
||||
dest.ent[-i] = src.ent[-i]
|
||||
dest.ents_len = src.ents_len
|
||||
|
||||
|
||||
# From https://en.wikipedia.org/wiki/Hamming_weight
|
||||
cdef inline uint32_t _popcount(uint32_t x) nogil:
|
||||
"""Find number of non-zero bits."""
|
||||
cdef int count = 0
|
||||
while x != 0:
|
||||
x &= x - 1
|
||||
count += 1
|
||||
return count
|
||||
|
||||
|
||||
cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
|
||||
cdef int i
|
||||
for i in range(32):
|
||||
if bits & (1 << i):
|
||||
n -= 1
|
||||
if n < 1:
|
||||
return i
|
||||
return 0
|
||||
|
|
Loading…
Reference in New Issue
Block a user