# spaCy/spacy/syntax/_state.pxd

from libc.string cimport memcpy, memset
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint32_t, uint64_t
from murmurhash.mrmr cimport hash64
from ..vocab cimport EMPTY_LEXEME
from ..structs cimport TokenC, Entity
from ..lexeme cimport Lexeme
from ..symbols cimport punct
from ..attrs cimport IS_SPACE


cdef inline bint is_space_token(const TokenC* token) nogil:
    return Lexeme.c_check_flag(token.lex, IS_SPACE)
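

# StateC is the C-level parse state used by the transition-based parser and
# entity recognizer: a stack and a buffer of token indices over a padded
# copy of the sentence, plus a record of the entity spans built so far.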
cdef cppclass StateC:
    int* _stack
    int* _buffer
    bint* shifted
    TokenC* _sent
    Entity* _ents
    TokenC _empty_token
    int length
    int _s_i
    int _b_i
    int _e_i
    int _break
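
    # The arrays are over-allocated by PADDING slots on each side and the
    # pointers are advanced past the left padding, so modest out-of-bounds
    # reads land on zeroed padding entries instead of unowned memory.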
    __init__(const TokenC* sent, int length) nogil:
        cdef int PADDING = 5
        this._buffer = <int*>calloc(length + (PADDING * 2), sizeof(int))
        this._stack = <int*>calloc(length + (PADDING * 2), sizeof(int))
        this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
        this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC))
        this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity))
        cdef int i
        for i in range(length + (PADDING * 2)):
            this._ents[i].end = -1
            this._sent[i].l_edge = i
            this._sent[i].r_edge = i
        for i in range(PADDING):
            this._sent[i].lex = &EMPTY_LEXEME
        # Give the trailing padding slots a valid lexeme pointer as well.
        for i in range(length + PADDING, length + (PADDING * 2)):
            this._sent[i].lex = &EMPTY_LEXEME
        this._sent += PADDING
        this._ents += PADDING
        this._buffer += PADDING
        this._stack += PADDING
        this.shifted += PADDING
        this.length = length
        this._break = -1
        this._s_i = 0
        this._b_i = 0
        this._e_i = 0
        for i in range(length):
            this._buffer[i] = i
        memset(&this._empty_token, 0, sizeof(TokenC))
        this._empty_token.lex = &EMPTY_LEXEME
        for i in range(length):
            this._sent[i] = sent[i]
            this._buffer[i] = i

    __dealloc__():
        cdef int PADDING = 5
        free(this._sent - PADDING)
        free(this._ents - PADDING)
        free(this._buffer - PADDING)
        free(this._stack - PADDING)
        free(this.shifted - PADDING)
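
    # Index accessors: S(i) is the i-th token from the top of the stack and
    # B(i) the i-th token of the buffer; both return -1 when out of range.
    # The underscore variants (S_, B_, H_, E_, L_, R_) resolve those indices
    # to TokenC pointers through safe_get(), which substitutes _empty_token
    # for anything out of range.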
    int S(int i) nogil const:
        if i >= this._s_i:
            return -1
        return this._stack[this._s_i - (i+1)]

    int B(int i) nogil const:
        if (i + this._b_i) >= this.length:
            return -1
        return this._buffer[this._b_i + i]

    const TokenC* S_(int i) nogil const:
        return this.safe_get(this.S(i))

    const TokenC* B_(int i) nogil const:
        return this.safe_get(this.B(i))

    const TokenC* H_(int i) nogil const:
        return this.safe_get(this.H(i))

    const TokenC* E_(int i) nogil const:
        return this.safe_get(this.E(i))

    const TokenC* L_(int i, int idx) nogil const:
        return this.safe_get(this.L(i, idx))

    const TokenC* R_(int i, int idx) nogil const:
        return this.safe_get(this.R(i, idx))

    const TokenC* safe_get(int i) nogil const:
        if i < 0 or i >= this.length:
            return &this._empty_token
        else:
            return &this._sent[i]

    int H(int i) nogil const:
        if i < 0 or i >= this.length:
            return -1
        return this._sent[i].head + i

    int E(int i) nogil const:
        if this._e_i <= 0 or this._e_i >= this.length:
            return 0
        if i < 0 or i >= this._e_i:
            return 0
        return this._ents[this._e_i - (i+1)].start
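
    # L(i, idx) returns the index of the idx-th left child of token i,
    # counting from the left (idx starts at 1), or -1 if there is no such
    # child. Heads are stored as relative offsets, so ptr + ptr.head is the
    # head of the token at ptr.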
    int L(int i, int idx) nogil const:
        if idx < 1:
            return -1
        if i < 0 or i >= this.length:
            return -1
        cdef const TokenC* target = &this._sent[i]
        if target.l_kids < <uint32_t>idx:
            return -1
        cdef const TokenC* ptr = &this._sent[target.l_edge]
        while ptr < target:
            # If this head is still to the right of us, we can skip to it.
            # No token that's between this token and this head could be our
            # child.
            if (ptr.head >= 1) and (ptr + ptr.head) < target:
                ptr += ptr.head
            elif ptr + ptr.head == target:
                idx -= 1
                if idx == 0:
                    return ptr - this._sent
                ptr += 1
            else:
                ptr += 1
        return -1
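
    # R(i, idx) mirrors L(i, idx) for right children, scanning leftward from
    # the right edge, so R(i, 1) is the rightmost child of token i.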
    int R(int i, int idx) nogil const:
        if idx < 1:
            return -1
        if i < 0 or i >= this.length:
            return -1
        cdef const TokenC* target = &this._sent[i]
        if target.r_kids < <uint32_t>idx:
            return -1
        cdef const TokenC* ptr = &this._sent[target.r_edge]
        while ptr > target:
            # If this head is still to the left of us, we can skip to it.
            # No token that's between this token and this head could be our
            # child.
            if (ptr.head < 0) and ((ptr + ptr.head) > target):
                ptr += ptr.head
            elif ptr + ptr.head == target:
                idx -= 1
                if idx == 0:
                    return ptr - this._sent
                ptr -= 1
            else:
                ptr -= 1
        return -1

    bint empty() nogil const:
        return this._s_i <= 0

    bint eol() nogil const:
        return this.buffer_length() == 0

    bint at_break() nogil const:
        return this._break != -1

    bint is_final() nogil const:
        return this.stack_depth() <= 0 and this._b_i >= this.length

    bint has_head(int i) nogil const:
        return this.safe_get(i).head != 0

    int n_L(int i) nogil const:
        return this.safe_get(i).l_kids

    int n_R(int i) nogil const:
        return this.safe_get(i).r_kids

    bint stack_is_connected() nogil const:
        return False

    bint entity_is_open() nogil const:
        if this._e_i < 1:
            return False
        return this._ents[this._e_i-1].end == -1

    int stack_depth() nogil const:
        return this._s_i

    int buffer_length() nogil const:
        if this._break != -1:
            return this._break - this._b_i
        else:
            return this.length - this._b_i
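
    # Hash an 11-token signature of the configuration: the top three stack
    # items, selected left/right children of S(0) and S(1), the first buffer
    # token and the starts of the two most recent entities. Presumably this
    # lets equivalent states be recognised cheaply, e.g. during beam search.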
    uint64_t hash() nogil const:
        cdef TokenC[11] sig
        sig[0] = this.S_(2)[0]
        sig[1] = this.S_(1)[0]
        sig[2] = this.R_(this.S(1), 1)[0]
        sig[3] = this.L_(this.S(0), 1)[0]
        sig[4] = this.L_(this.S(0), 2)[0]
        sig[5] = this.S_(0)[0]
        sig[6] = this.R_(this.S(0), 2)[0]
        sig[7] = this.R_(this.S(0), 1)[0]
        sig[8] = this.B_(0)[0]
        sig[9] = this.E_(0)[0]
        sig[10] = this.E_(1)[0]
        return hash64(sig, sizeof(sig), this._s_i)
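
    # Core stack/buffer transitions: push() moves the first buffer token onto
    # the stack, pop() discards the stack top, and unshift() moves the stack
    # top back to the front of the buffer, recording that it has already been
    # shifted once.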
    void push() nogil:
        if this.B(0) != -1:
            this._stack[this._s_i] = this.B(0)
        this._s_i += 1
        this._b_i += 1
        if this._b_i > this._break:
            this._break = -1

    void pop() nogil:
        if this._s_i >= 1:
            this._s_i -= 1

    void unshift() nogil:
        this._b_i -= 1
        this._buffer[this._b_i] = this.S(0)
        this._s_i -= 1
        this.shifted[this.B(0)] = True
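
    # add_arc() stores the arc on the child as a relative head offset
    # (head - child) plus a dependency label, and keeps the head's child
    # counts and l_edge/r_edge bookkeeping in sync. Any existing head of the
    # child is detached first.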
    void add_arc(int head, int child, int label) nogil:
        if this.has_head(child):
            this.del_arc(this.H(child), child)
        cdef int dist = head - child
        this._sent[child].head = dist
        this._sent[child].dep = label
        cdef int i
        if child > head:
            this._sent[head].r_kids += 1
            # Some transition systems can leave a word in the buffer with a
            # rightward child, e.g. after an Unshift, so propagate the new
            # right edge up through the chain of heads.
            this._sent[head].r_edge = this._sent[child].r_edge
            i = 0
            while this.has_head(head) and i < this.length:
                head = this.H(head)
                this._sent[head].r_edge = this._sent[child].r_edge
                i += 1  # Guard against infinite loops
        else:
            this._sent[head].l_kids += 1
            this._sent[head].l_edge = this._sent[child].l_edge
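
    # del_arc() removes the arc from h_i to its child c_i and recomputes the
    # head's left or right edge from its remaining children; the upward
    # correction along the head chain is the fix for Issue #251.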
    void del_arc(int h_i, int c_i) nogil:
        cdef int dist = h_i - c_i
        cdef TokenC* h = &this._sent[h_i]
        cdef int i = 0
        if c_i > h_i:
            # this.R_(h_i, 2) returns the second-rightmost child token of h_i.
            # If the head has at least two right children, that child's right
            # edge becomes the new right edge; otherwise the head becomes its
            # own right edge.
            h.r_edge = this.R_(h_i, 2).r_edge if h.r_kids >= 2 else h_i
            h.r_kids -= 1
            new_edge = h.r_edge
            # Correct upwards in the tree --- see Issue #251
            while h.head < 0 and i < this.length:  # Guard against infinite loops
                h += h.head
                h.r_edge = new_edge
                i += 1
        else:
            # The same logic applies to the left edge, but we don't need to
            # walk up the tree, as the head is off the stack.
            h.l_edge = this.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
            h.l_kids -= 1
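
    # Entity spans under construction: open_ent() starts a new entity at the
    # front of the buffer with end == -1, and close_ent() fills in the end
    # offset once the entity is complete.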
    void open_ent(int label) nogil:
        this._ents[this._e_i].start = this.B(0)
        this._ents[this._e_i].label = label
        this._ents[this._e_i].end = -1
        this._e_i += 1

    void close_ent() nogil:
        # Note that we don't decrement _e_i here! We want to maintain all
        # entities, not over-write them...
        this._ents[this._e_i-1].end = this.B(0)+1
        this._sent[this.B(0)].ent_iob = 1

    void set_ent_tag(int i, int ent_iob, int ent_type) nogil:
        if 0 <= i < this.length:
            this._sent[i].ent_iob = ent_iob
            this._sent[i].ent_type = ent_type

    void set_break(int i) nogil:
        if 0 <= i < this.length:
            this._sent[i].sent_start = True
            this._break = this._b_i
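
    # Copy another state's token, stack, buffer and entity arrays plus its
    # counters, e.g. so that a search procedure can branch off from a shared
    # configuration.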
    void clone(const StateC* src) nogil:
        memcpy(this._sent, src._sent, this.length * sizeof(TokenC))
        memcpy(this._stack, src._stack, this.length * sizeof(int))
        memcpy(this._buffer, src._buffer, this.length * sizeof(int))
        memcpy(this._ents, src._ents, this.length * sizeof(Entity))
        memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
        this.length = src.length
        this._b_i = src._b_i
        this._s_i = src._s_i
        this._e_i = src._e_i
        this._break = src._break
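
    # Apply transitions that need no model decision: attach whitespace tokens
    # according to the policy below, pop finished sentence roots, and shift
    # when shifting is the only valid action.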
    void fast_forward() nogil:
        # Space token attachment policy:
        # - always attach space tokens to the last preceding real token
        # - except at the beginning of a sentence, where they attach to the
        #   first following real token
        # - boundary case: if a document contains only space tokens, make the
        #   last space token the head of all the others
        while is_space_token(this.B_(0)) \
                or this.buffer_length() == 0 \
                or this.stack_depth() == 0:
            if this.buffer_length() == 0:
                # remove the last sentence's root from the stack
                if this.stack_depth() == 1:
                    this.pop()
                # parser got stuck: reduce stack or unshift
                elif this.stack_depth() > 1:
                    if this.has_head(this.S(0)):
                        this.pop()
                    else:
                        this.unshift()
                # stack is empty but there is another sentence on the buffer
                elif (this.length - this._b_i) >= 1:
                    this.push()
                else:  # stack empty and nothing else coming
                    break
            elif is_space_token(this.B_(0)):
                # the normal case: we're somewhere inside a sentence
                if this.stack_depth() > 0:
                    # assert not is_space_token(this.S_(0))
                    # attach all upcoming space tokens to their last preceding
                    # real token (which should be on top of the stack)
                    while is_space_token(this.B_(0)):
                        this.add_arc(this.S(0), this.B(0), 0)
                        this.push()
                        this.pop()
                # the rare case: we're at the beginning of a document, so
                # space tokens are attached to the first real token on the buffer
                elif this.stack_depth() == 0:
                    # store all space tokens on the stack until a real token
                    # shows up or the last token on the buffer is reached
                    while is_space_token(this.B_(0)) and this.buffer_length() > 1:
                        this.push()
                    # empty the stack by attaching all space tokens to the
                    # first token on the buffer
                    # boundary case: if all tokens are space tokens, the last
                    # one becomes the head of all the others
                    while this.stack_depth() > 0:
                        this.add_arc(this.B(0), this.S(0), 0)
                        this.pop()
                    # move the first token onto the stack
                    this.push()
            elif this.stack_depth() == 0:
                # for one-token sentences (?)
                if this.buffer_length() == 1:
                    this.push()
                    this.pop()
                # with an empty stack and a non-empty buffer
                # only shift is valid anyway
                elif (this.length - this._b_i) >= 1:
                    this.push()
                else:  # can this even happen?
                    break
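

# A rough sketch of how a transition loop might drive StateC. The names
# predict_action, SHIFT, LEFT_ARC, RIGHT_ARC and label are placeholders, not
# part of this module; `doc` stands for an object exposing a TokenC* array
# and its length, and the arc directions follow a generic arc-eager scheme
# rather than a specific transition system:
#
#     cdef StateC* state = new StateC(doc.c, doc.length)
#     state.fast_forward()
#     while not state.is_final():
#         action = predict_action(state)        # placeholder scoring step
#         if action == SHIFT:
#             state.push()
#         elif action == LEFT_ARC:              # head = B(0), child = S(0)
#             state.add_arc(state.B(0), state.S(0), label)
#             state.pop()
#         elif action == RIGHT_ARC:             # head = S(0), child = B(0)
#             state.add_arc(state.S(0), state.B(0), label)
#             state.push()
#         state.fast_forward()
#     del state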