add setters for a token's head and dependency label

change PseudoProjectivity.deprojectivize to use these setters
2026-01-09 10:11:24 +03:00 · 2016-03-11 17:31:06 +01:00 · 2016-03-11 17:31:06 +01:00 · 46e3f979f1
commit 46e3f979f1
parent 1508528c8c
4 changed files with 192 additions and 9 deletions
--- a/spacy/syntax/nonproj.pyx
+++ b/spacy/syntax/nonproj.pyx
@ -118,15 +118,18 @@ class PseudoProjectivity:
        # reattach arcs with decorated labels (following HEAD scheme)
        # for each decorated arc X||Y, search top-down, left-to-right,
        # breadth-first until hitting a Y then make this the new head
-        parse = tokens.to_array([HEAD, DEP])
-        labels = [ tokens.vocab.strings[int(p[1])] for p in parse ]
+        #parse = tokens.to_array([HEAD, DEP])
        for token in tokens:
            if cls.is_decorated(token.dep_):
                newlabel,headlabel = cls.decompose(token.dep_)
                newhead = cls._find_new_head(token,headlabel)
-                parse[token.i,1] = tokens.vocab.strings[newlabel]
-                parse[token.i,0] = newhead.i - token.i
-        tokens.from_array([HEAD, DEP],parse)
+                token.head = newhead
+                token.dep_ = newlabel
+
+                # tokens.attach(token,newhead,newlabel)
+                #parse[token.i,1] = tokens.vocab.strings[newlabel]
+                #parse[token.i,0] = newhead.i - token.i
+        #tokens.from_array([HEAD, DEP],parse)


    @classmethod
@ -168,7 +171,7 @@ class PseudoProjectivity:

    @classmethod
    def _find_new_head(cls, token, headlabel):
-        # search through the tree starting from root
+        # search through the tree starting from the head of the given token
        # returns the id of the first descendant with the given label
        # if there is none, return the current head (no change)
        queue = [token.head]
@ -176,8 +179,8 @@ class PseudoProjectivity:
            next_queue = []
            for qtoken in queue:
                for child in qtoken.children:
-                    if child == token:
-                        continue
+                    if child.is_space: continue                        
+                    if child == token: continue
                    if child.dep_ == headlabel:
                        return child
                    next_queue.append(child)
--- a/spacy/tests/tokens/test_token_api.py
+++ b/spacy/tests/tokens/test_token_api.py
@ -62,3 +62,67 @@ def test_vectors(EN):
    assert sum(apples.vector) != sum(oranges.vector)
    assert apples.vector_norm != oranges.vector_norm
    
+@pytest.mark.models
+def test_ancestors(EN):
+    # NOTE: the expected heads depend on the English annotation scheme
+    doc = EN(u'Yesterday I saw a dog that barked loudly.')
+    # token index -> surface forms of its ancestors, nearest ancestor first
+    expected = {6: ['dog', 'saw'], 1: ['saw'], 2: []}
+    for index in (6, 1, 2):
+        assert [anc.orth_ for anc in doc[index].ancestors] == expected[index]
+
+    saw, barked, loudly = doc[2], doc[6], doc[7]
+    # 'saw' has no ancestors itself, yet sits above 'loudly'; never the reverse
+    assert saw.is_ancestor_of(loudly)
+    assert not barked.is_ancestor_of(saw)
+
+
+@pytest.mark.models
+def test_head_setter(EN):
+    # NOTE: the expected structure depends on the English annotation scheme
+    yesterday, i, saw, a, dog, that, barked, loudly, dot = EN(u'Yesterday I saw a dog that barked loudly.')
+
+    def check_subtree(token, n_lefts, n_rights, left_edge, right_edge):
+        # child counters first, then the boundaries of the token's subtree
+        assert token.n_lefts == n_lefts
+        assert token.n_rights == n_rights
+        assert token.left_edge == left_edge
+        assert token.right_edge == right_edge
+
+    def check_span(token, left_edge, right_edge):
+        # boundaries only, for tokens whose child counters are not under test
+        assert token.left_edge == left_edge
+        assert token.right_edge == right_edge
+
+    # state of the parse before any head is reassigned
+    check_subtree(barked, 1, 1, that, loudly)
+    check_subtree(dog, 1, 1, a, loudly)
+    check_subtree(a, 0, 0, a, a)
+    check_span(saw, yesterday, dot)
+
+    # move 'barked' (together with its subtree) from 'dog' to 'a'
+    barked.head = a
+
+    check_subtree(barked, 1, 1, that, loudly)
+    check_subtree(a, 0, 1, a, loudly)
+    check_subtree(dog, 1, 0, a, loudly)
+    check_span(saw, yesterday, dot)
+
+    # attach 'Yesterday' below 'that'; the new left edge has to propagate
+    # from 'that' up through every ancestor
+    yesterday.head = that
+
+    for token in (that, barked, a, dog, saw):
+        assert token.left_edge == yesterday
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@ -6,7 +6,7 @@ from .doc cimport Doc

 cdef class Token:
    cdef Vocab vocab
-    cdef const TokenC* c
+    cdef TokenC* c
    cdef readonly int i
    cdef int array_len
    cdef readonly Doc doc
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -142,6 +142,8 @@ cdef class Token:
    property dep:
+        # Integer ID of the token's dependency label, read from and written to
+        # the `dep` field of the underlying TokenC struct.
        def __get__(self):
            return self.c.dep
+        def __set__(self, int label):
+            # Stores the ID as given; nothing else on the token is updated here.
+            self.c.dep = label

    property has_vector:
        def __get__(self):
@ -250,10 +252,122 @@ cdef class Token:
        def __get__(self):
            return self.doc[self.c.r_edge]

+    property ancestors:
+        def __get__(self):
+            # Generator over the chain of heads above this token, nearest
+            # first. A token whose relative head offset is 0 ends the walk.
+            cdef const TokenC* first = self.c - self.i
+            cdef const TokenC* current = self.c
+            # cap the number of steps at the document length so a cyclic
+            # head chain cannot loop forever
+            cdef int n_steps = 0
+            while n_steps < self.doc.length and current.head != 0:
+                current += current.head
+                n_steps += 1
+                # pointer distance from the first struct is the token index
+                yield self.doc[current - first]
+
+    def is_ancestor_of(self, descendant):
+        # True if this token's index shows up among the ancestors of
+        # `descendant`. Only the indices `i` are compared.
+        for candidate in descendant.ancestors:
+            if candidate.i == self.i:
+                return True
+        return False
+
    property head:
        def __get__(self):
            """The token predicted by the parser to be the head of the current token."""
            return self.doc[self.i + self.c.head]
+        def __set__(self, Token new_head):
+            # Re-attach this token to new_head and keep the cached tree
+            # statistics consistent: the l_kids/r_kids counters and the
+            # l_edge/r_edge subtree boundaries of the old head, the new head
+            # and their ancestors.
+            #
+            # c.head is the head position relative to this token, so a
+            # positive value means the head is to the right and this token is
+            # one of its left dependents.
+
+            # do nothing if old head is new head
+            if self.i + self.c.head == new_head.i:
+                return
+
+            cdef Token old_head = self.head
+            cdef int rel_newhead_i = new_head.i - self.i
+
+            # is the new head a descendant of the old head
+            cdef bint is_desc = old_head.is_ancestor_of(new_head)
+
+            cdef int token_i
+            cdef int new_edge
+            cdef Token anc
+
+            # update number of deps of old head
+            if self.c.head > 0: # left dependent
+                old_head.c.l_kids -= 1
+                if self.c.l_edge == old_head.c.l_edge:
+                    # the token dominates the left edge so the left edge of the head
+                    # may change when the token is reattached
+                    # it may not change if the new head is a descendant of the current head
+
+                    # find new l_edge if new head is not a descendant of old head
+                    # a new l_edge is any token between l_edge and old_head
+                    # that is a descendant of old_head but not of self
+                    new_edge = self.c.l_edge
+                    if not is_desc:
+                        # the range bound has to be the integer index kept on
+                        # the struct (c.l_edge), as everywhere else in this setter
+                        for token_i in range(old_head.c.l_edge+1,old_head.i):
+                            if self.doc.c[token_i].l_kids == 0: # only a token without left deps can be a left edge
+                                if self.is_ancestor_of(self.doc[token_i]):
+                                    continue
+                                if old_head.is_ancestor_of(self.doc[token_i]):
+                                    new_edge = token_i
+                                    break
+                        else: # set the new l_edge to old_head if no other was found
+                            new_edge = old_head.i
+
+                    # assign new l_edge to old_head
+                    old_head.c.l_edge = new_edge
+                    # walk up the tree from old_head and assign new l_edge to ancestors
+                    # until an ancestor already has an l_edge that's further left
+                    for anc in old_head.ancestors:
+                        if anc.c.l_edge <= new_edge:
+                            break
+                        anc.c.l_edge = new_edge
+
+            elif self.c.head < 0: # right dependent
+                old_head.c.r_kids -= 1
+                # do the same thing as for l_edge
+                if self.c.r_edge == old_head.c.r_edge:
+                    new_edge = self.c.r_edge
+                    if not is_desc:
+                        # mirror image of the left scan: start just inside the
+                        # old right edge (integer c.r_edge) and move towards the head
+                        for token_i in range(old_head.c.r_edge-1,old_head.i,-1):
+                            if self.doc.c[token_i].r_kids == 0:
+                                if self.is_ancestor_of(self.doc[token_i]):
+                                    continue
+                                if old_head.is_ancestor_of(self.doc[token_i]):
+                                    new_edge = token_i
+                                    break
+                        else:
+                            new_edge = old_head.i
+                    old_head.c.r_edge = new_edge
+                    for anc in old_head.ancestors:
+                        if anc.c.r_edge >= new_edge:
+                            break
+                        anc.c.r_edge = new_edge
+
+            # update number of deps of new head
+            if rel_newhead_i > 0: # left dependent
+                new_head.c.l_kids += 1
+                # walk up the tree from new head and set l_edge to self.l_edge
+                # until you hit a token with an l_edge further to the left
+                if self.c.l_edge < new_head.c.l_edge:
+                    new_edge = self.c.l_edge
+                    new_head.c.l_edge = new_edge
+                    for anc in new_head.ancestors:
+                        if anc.c.l_edge <= new_edge:
+                            break
+                        anc.c.l_edge = new_edge
+
+            elif rel_newhead_i < 0: # right dependent
+                new_head.c.r_kids += 1
+                # do the same as for l_edge
+                if self.c.r_edge > new_head.c.r_edge:
+                    new_edge = self.c.r_edge
+                    new_head.c.r_edge = new_edge
+                    for anc in new_head.ancestors:
+                        if anc.c.r_edge >= new_edge:
+                            break
+                        anc.c.r_edge = new_edge
+
+            # set new head (stored relative to this token, matching __get__)
+            self.c.head = rel_newhead_i

    property conjuncts:
        def __get__(self):
@ -325,6 +439,8 @@ cdef class Token:
    property dep_:
+        # String form of the dependency label: the ID held in c.dep is
+        # translated through the vocab's string table.
        def __get__(self):
            return self.vocab.strings[self.c.dep]
+        def __set__(self, unicode label):
+            # Looks the text up in the vocab's string table and stores the
+            # result in the same c.dep field that the integer `dep` property uses.
+            self.c.dep = self.vocab.strings[label]

    property is_oov:
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)