spaCy/spacy/syntax/_state.pyx

# cython: profile=True
from libc.string cimport memmove
from cymem.cymem cimport Pool

from ..lexeme cimport EMPTY_LEXEME


cdef int add_dep(State *s, TokenC* head, TokenC* child, int label) except -1:
    child.head = head - child
    child.dep_tag = label
    # Keep a bit-vector tracking child dependencies.  If a word has a child at
    # offset i from it, set that bit (tracking left and right separately)
    if child > head:
        head.r_kids |= 1 << child.head
    else:
        head.l_kids |= 1 << (-child.head)


cdef TokenC* pop_stack(State *s) except NULL:
    assert s.stack_len >= 1
    cdef TokenC* top = s.stack[0]
    s.stack -= 1
    s.stack_len -= 1
    return top


cdef int push_stack(State *s) except -1:
    assert s.i < s.sent_len
    s.stack += 1
    s.stack[0] = &s.sent[s.i]
    s.stack_len += 1
    s.i += 1


cdef int children_in_buffer(const State *s, const TokenC* target, list gold) except -1:
    # Golds holds an array of head offsets --- the head of word i is i - golds[i]
    # Iterate over the tokens of the queue, and check whether their gold head is
    # our target
    cdef int i
    cdef int n = 0
    cdef TokenC* buff_word
    cdef TokenC* buff_head
    cdef int buff_word_head_offset
    for i in range(s.i, s.sent_len):
        buff_word = &s.sent[i]
        buff_word_head_offset = gold[i]
        buff_head = buff_word + buff_word_head_offset
        if buff_head == target:
            n += 1
    return n


cdef int head_in_buffer(const State *s, const TokenC* target, list gold) except -1:
    cdef int target_idx = get_idx(s, target)
    cdef int target_head_idx = target_idx + gold[target_idx]
    return target_head_idx >= s.i


cdef int children_in_stack(const State *s, const TokenC* target, list gold) except -1:
    if s.stack_len == 0:
        return 0
    cdef int i
    cdef int n = 0
    cdef const TokenC* stack_word
    cdef const TokenC* stack_word_head
    cdef int stack_word_head_offset
    for i in range(s.stack_len):
        stack_word = (s.stack - i)[0]
        stack_word_head_offset = gold[get_idx(s, stack_word)]
        stack_word_head = (s.stack + stack_word_head_offset)[0]
        if stack_word_head == target:
            n += 1
    return n


cdef int head_in_stack(const State *s, const TokenC* target, list gold) except -1:
    if s.stack_len == 0:
        return 0
    cdef int head_offset = gold[get_idx(s, target)]
    cdef const TokenC* target_head = target + head_offset
    cdef int i
    for i in range(s.stack_len):
        if target_head == (s.stack - i)[0]:
            return 1
    return 0


cdef TokenC* get_left(State* s, TokenC* head, int idx) nogil:
    cdef uint32_t kids = head.l_kids
    if kids == 0:
        return s.sent - 1
    cdef int offset = _nth_significant_bit(kids, idx)
    cdef TokenC* child = head - offset
    if child >= s.sent:
        return child
    else:
        return s.sent - 1


cdef TokenC* get_right(State* s, TokenC* head, int idx) nogil:
    cdef uint32_t kids = head.r_kids
    if kids == 0:
        return s.sent - 1
    cdef int offset = _nth_significant_bit(kids, idx)
    cdef TokenC* child = head + offset
    if child < (s.sent + s.sent_len):
        return child
    else:
        return s.sent - 1


DEF PADDING = 5


cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL:
    cdef int padded_len = sent_length + PADDING + PADDING
    cdef State* s = <State*>mem.alloc(1, sizeof(State))
    s.stack = <TokenC**>mem.alloc(padded_len, sizeof(TokenC*))
    cdef TokenC* eol_token = sent - 1
    for i in range(PADDING):
        # sent should be padded, with a suitable sentinel token here
        s.stack[0] = eol_token
        s.stack += 1
    s.stack[0] = eol_token
    s.sent = sent
    s.stack_len = 0
    s.i = 0
    s.sent_len = sent_length
    return s
* Work on greedy parser 2014-12-16 14:44:43 +03:00			`# cython: profile=True`
			`from libc.string cimport memmove`
			`from cymem.cymem cimport Pool`

			`from ..lexeme cimport EMPTY_LEXEME`


			`cdef int add_dep(State s, TokenC head, TokenC* child, int label) except -1:`
			`child.head = head - child`
			`child.dep_tag = label`
			`# Keep a bit-vector tracking child dependencies. If a word has a child at`
			`# offset i from it, set that bit (tracking left and right separately)`
			`if child > head:`
			`head.r_kids \|= 1 << child.head`
			`else:`
			`head.l_kids \|= 1 << (-child.head)`


			`cdef TokenC* pop_stack(State *s) except NULL:`
			`assert s.stack_len >= 1`
			`cdef TokenC* top = s.stack[0]`
			`s.stack -= 1`
			`s.stack_len -= 1`
			`return top`


			`cdef int push_stack(State *s) except -1:`
			`assert s.i < s.sent_len`
			`s.stack += 1`
			`s.stack[0] = &s.sent[s.i]`
			`s.stack_len += 1`
			`s.i += 1`


			`cdef int children_in_buffer(const State s, const TokenC target, list gold) except -1:`
			`# Golds holds an array of head offsets --- the head of word i is i - golds[i]`
			`# Iterate over the tokens of the queue, and check whether their gold head is`
			`# our target`
			`cdef int i`
			`cdef int n = 0`
			`cdef TokenC* buff_word`
			`cdef TokenC* buff_head`
			`cdef int buff_word_head_offset`
			`for i in range(s.i, s.sent_len):`
			`buff_word = &s.sent[i]`
			`buff_word_head_offset = gold[i]`
			`buff_head = buff_word + buff_word_head_offset`
			`if buff_head == target:`
			`n += 1`
			`return n`


			`cdef int head_in_buffer(const State s, const TokenC target, list gold) except -1:`
			`cdef int target_idx = get_idx(s, target)`
			`cdef int target_head_idx = target_idx + gold[target_idx]`
			`return target_head_idx >= s.i`


			`cdef int children_in_stack(const State s, const TokenC target, list gold) except -1:`
			`if s.stack_len == 0:`
			`return 0`
			`cdef int i`
			`cdef int n = 0`
			`cdef const TokenC* stack_word`
			`cdef const TokenC* stack_word_head`
			`cdef int stack_word_head_offset`
			`for i in range(s.stack_len):`
			`stack_word = (s.stack - i)[0]`
			`stack_word_head_offset = gold[get_idx(s, stack_word)]`
			`stack_word_head = (s.stack + stack_word_head_offset)[0]`
			`if stack_word_head == target:`
			`n += 1`
			`return n`


			`cdef int head_in_stack(const State s, const TokenC target, list gold) except -1:`
			`if s.stack_len == 0:`
			`return 0`
			`cdef int head_offset = gold[get_idx(s, target)]`
			`cdef const TokenC* target_head = target + head_offset`
			`cdef int i`
			`for i in range(s.stack_len):`
			`if target_head == (s.stack - i)[0]:`
			`return 1`
			`return 0`


			`cdef TokenC* get_left(State* s, TokenC* head, int idx) nogil:`
			`cdef uint32_t kids = head.l_kids`
			`if kids == 0:`
			`return s.sent - 1`
			`cdef int offset = _nth_significant_bit(kids, idx)`
			`cdef TokenC* child = head - offset`
			`if child >= s.sent:`
			`return child`
			`else:`
			`return s.sent - 1`


			`cdef TokenC* get_right(State* s, TokenC* head, int idx) nogil:`
			`cdef uint32_t kids = head.r_kids`
			`if kids == 0:`
			`return s.sent - 1`
			`cdef int offset = _nth_significant_bit(kids, idx)`
			`cdef TokenC* child = head + offset`
			`if child < (s.sent + s.sent_len):`
			`return child`
			`else:`
			`return s.sent - 1`


			`DEF PADDING = 5`


			`cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL:`
			`cdef int padded_len = sent_length + PADDING + PADDING`
			`cdef State* s = <State*>mem.alloc(1, sizeof(State))`
			`s.stack = <TokenC*>mem.alloc(padded_len, sizeof(TokenC))`
			`cdef TokenC* eol_token = sent - 1`
			`for i in range(PADDING):`
			`# sent should be padded, with a suitable sentinel token here`
			`s.stack[0] = eol_token`
			`s.stack += 1`
			`s.stack[0] = eol_token`
			`s.sent = sent`
			`s.stack_len = 0`
			`s.i = 0`
			`s.sent_len = sent_length`
			`return s`