WIP on split parsing

2025-10-02 18:06:46 +03:00 · 2018-03-28 01:44:05 +02:00 · 2018-03-28 01:44:05 +02:00 · d399843576
commit d399843576
parent de9fd091ac
2 changed files with 21 additions and 1 deletions
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@ -319,6 +319,21 @@ cdef cppclass StateC:
        if this._b_i > this._break:
            this._break = -1

+    void split(int i, int n) nogil:
+        '''Split token i of the buffer into N pieces.'''
+        # Let's say we've got a length 10 sentence.
+        # state.split(5, 2)
+        # Before: [0, 1, 2, 3, 4, 5,   6,   7,   8, 9, 10]
+        # After:  [0, 1, 2, 3, 4, 5.0, 5.1, 5.2, 6, 7, 8, 9, 10]
+        # Sentence grows to length 12.
+        this.length += n
+        this._sent -= PADDING
+        this._sent = <TokenC*>realloc(this.length + (PADDING * 2), sizeof(TokenC))
+        this._sent += PADDING
+        # Words 6-10 move to positions 8-12
+        memmove(&this._sent[i+1], &this._sent[i+1+n], (this.length-i)+PADDING*sizeof(TokenC))
+        # Words 0-5 stay where they are.
+
    void pop() nogil:
        if this._s_i >= 1:
            this._s_i -= 1
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@ -122,6 +122,8 @@ cdef class Shift:

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
+        if label != 0:
+            st.split(st.B(1), label)
        st.push()
        st.fast_forward()

@ -135,7 +137,10 @@ cdef class Shift:

    @staticmethod
    cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
-        return 0
+        if gold.fused_tokens[s.B(1)] == label:
+            return 0
+        else:
+            return 1


 cdef class Reduce: