Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-04 21:50:35 +03:00)
Merge pull request #287 from wbwseeker/deproj_sentbnd_bug

add function to Token for setting head and dep (and dep_)

This commit is contained in: 4a37fdcee1
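
For orientation, a minimal usage sketch of what this change enables, modelled on the new test_head_setter test further down in this diff. It is a sketch only: it assumes a spaCy of that era with the English model data installed and loaded via spacy.en.English, and the re-attachment target is taken straight from the test; the arc label is a made-up example.

    from spacy.en import English

    EN = English()
    tokens = EN(u'Yesterday I saw a dog that barked loudly.')
    a, barked = tokens[3], tokens[6]

    barked.head = a          # re-attach 'barked' under 'a'; l/r counters and edges are updated
    barked.dep_ = u'dep'     # hypothetical label; any string is interned by the vocab
    assert barked.head == a
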
@@ -118,15 +118,18 @@ class PseudoProjectivity:
         # reattach arcs with decorated labels (following HEAD scheme)
         # for each decorated arc X||Y, search top-down, left-to-right,
         # breadth-first until hitting a Y then make this the new head
-        parse = tokens.to_array([HEAD, DEP])
-        labels = [ tokens.vocab.strings[int(p[1])] for p in parse ]
+        #parse = tokens.to_array([HEAD, DEP])
         for token in tokens:
             if cls.is_decorated(token.dep_):
                 newlabel,headlabel = cls.decompose(token.dep_)
                 newhead = cls._find_new_head(token,headlabel)
-                parse[token.i,1] = tokens.vocab.strings[newlabel]
-                parse[token.i,0] = newhead.i - token.i
-        tokens.from_array([HEAD, DEP],parse)
+                token.head = newhead
+                token.dep_ = newlabel
+
+                # tokens.attach(token,newhead,newlabel)
+                #parse[token.i,1] = tokens.vocab.strings[newlabel]
+                #parse[token.i,0] = newhead.i - token.i
+        #tokens.from_array([HEAD, DEP],parse)


     @classmethod
@@ -168,7 +171,7 @@ class PseudoProjectivity:

     @classmethod
     def _find_new_head(cls, token, headlabel):
-        # search through the tree starting from root
+        # search through the tree starting from the head of the given token
         # returns the id of the first descendant with the given label
         # if there is none, return the current head (no change)
         queue = [token.head]
@@ -176,8 +179,8 @@
             next_queue = []
             for qtoken in queue:
                 for child in qtoken.children:
-                    if child == token:
-                        continue
+                    if child.is_space: continue
+                    if child == token: continue
                     if child.dep_ == headlabel:
                         return child
                     next_queue.append(child)
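
The comments in the hunk above refer to "decorated" arc labels of the form X||Y that were introduced when the tree was projectivized. A self-contained sketch of that label scheme, mirroring the cls.is_decorated / cls.decompose calls in the loop above (the split-on-'||' helpers and the concrete label are assumptions for illustration, not code from this diff):

    DELIMITER = u'||'   # assumed delimiter between the original label and the head label

    def is_decorated(label):
        return DELIMITER in label

    def decompose(label):
        # u'amod||nsubj' -> (u'amod', u'nsubj'): new arc label, dep label of the intended head
        return tuple(label.split(DELIMITER, 1))

    newlabel, headlabel = decompose(u'amod||nsubj')
    assert is_decorated(u'amod||nsubj')
    assert (newlabel, headlabel) == (u'amod', u'nsubj')

With newlabel and headlabel in hand, the deprojectivization loop re-attaches the token to the first descendant of its head whose dependency label matches headlabel, found breadth-first by _find_new_head as patched above.
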
@@ -62,3 +62,67 @@ def test_vectors(EN):
     assert sum(apples.vector) != sum(oranges.vector)
     assert apples.vector_norm != oranges.vector_norm

+@pytest.mark.models
+def test_ancestors(EN):
+    # the structure of this sentence depends on the English annotation scheme
+    tokens = EN(u'Yesterday I saw a dog that barked loudly.')
+    ancestors = [ t.orth_ for t in tokens[6].ancestors ]
+    assert ancestors == ['dog','saw']
+    ancestors = [ t.orth_ for t in tokens[1].ancestors ]
+    assert ancestors == ['saw']
+    ancestors = [ t.orth_ for t in tokens[2].ancestors ]
+    assert ancestors == []
+
+    assert tokens[2].is_ancestor_of(tokens[7])
+    assert not tokens[6].is_ancestor_of(tokens[2])
+
+
+@pytest.mark.models
+def test_head_setter(EN):
+    # the structure of this sentence depends on the English annotation scheme
+    yesterday, i, saw, a, dog, that, barked, loudly, dot = EN(u'Yesterday I saw a dog that barked loudly.')
+    assert barked.n_lefts == 1
+    assert barked.n_rights == 1
+    assert barked.left_edge == that
+    assert barked.right_edge == loudly
+
+    assert dog.n_lefts == 1
+    assert dog.n_rights == 1
+    assert dog.left_edge == a
+    assert dog.right_edge == loudly
+
+    assert a.n_lefts == 0
+    assert a.n_rights == 0
+    assert a.left_edge == a
+    assert a.right_edge == a
+
+    assert saw.left_edge == yesterday
+    assert saw.right_edge == dot
+
+    barked.head = a
+
+    assert barked.n_lefts == 1
+    assert barked.n_rights == 1
+    assert barked.left_edge == that
+    assert barked.right_edge == loudly
+
+    assert a.n_lefts == 0
+    assert a.n_rights == 1
+    assert a.left_edge == a
+    assert a.right_edge == loudly
+
+    assert dog.n_lefts == 1
+    assert dog.n_rights == 0
+    assert dog.left_edge == a
+    assert dog.right_edge == loudly
+
+    assert saw.left_edge == yesterday
+    assert saw.right_edge == dot
+
+    yesterday.head = that
+
+    assert that.left_edge == yesterday
+    assert barked.left_edge == yesterday
+    assert a.left_edge == yesterday
+    assert dog.left_edge == yesterday
+    assert saw.left_edge == yesterday
@@ -6,7 +6,7 @@ from .doc cimport Doc

 cdef class Token:
     cdef Vocab vocab
-    cdef const TokenC* c
+    cdef TokenC* c
     cdef readonly int i
     cdef int array_len
     cdef readonly Doc doc
@@ -142,6 +142,8 @@ cdef class Token:
     property dep:
         def __get__(self):
             return self.c.dep
+        def __set__(self, int label):
+            self.c.dep = label

     property has_vector:
         def __get__(self):
@@ -250,10 +252,113 @@ cdef class Token:
         def __get__(self):
             return self.doc[self.c.r_edge]

+    property ancestors:
+        def __get__(self):
+            cdef const TokenC* head_ptr = self.c
+            # guard against infinite loop, no token can have
+            # more ancestors than tokens in the tree
+            cdef int i = 0
+            while head_ptr.head != 0 and i < self.doc.length:
+                head_ptr += head_ptr.head
+                yield self.doc[head_ptr - (self.c - self.i)]
+                i += 1
+
+    def is_ancestor_of(self, descendant):
+        return any( ancestor.i == self.i for ancestor in descendant.ancestors )
+
     property head:
         def __get__(self):
             """The token predicted by the parser to be the head of the current token."""
             return self.doc[self.i + self.c.head]
+        def __set__(self, Token new_head):
+            # this function sets the head of self to new_head
+            # and updates the counters for left/right dependents
+            # and left/right corner for the new and the old head
+
+            # do nothing if old head is new head
+            if self.i + self.c.head == new_head.i:
+                return
+
+            cdef Token old_head = self.head
+            cdef int rel_newhead_i = new_head.i - self.i
+
+            # is the new head a descendant of the old head
+            cdef bint is_desc = old_head.is_ancestor_of(new_head)
+
+            cdef int new_edge
+            cdef Token anc, child
+
+            # update number of deps of old head
+            if self.c.head > 0: # left dependent
+                old_head.c.l_kids -= 1
+                if self.c.l_edge == old_head.c.l_edge:
+                    # the token dominates the left edge so the left edge of the head
+                    # may change when the token is reattached
+                    # it may not change if the new head is a descendant of the current head
+
+                    new_edge = self.c.l_edge
+                    # the new l_edge is the left-most l_edge on any of the other dependents
+                    # where the l_edge is left of the head, otherwise it is the head
+                    if not is_desc:
+                        new_edge = old_head.i
+                        for child in old_head.children:
+                            if child == self:
+                                continue
+                            if child.c.l_edge < new_edge:
+                                new_edge = child.c.l_edge
+                        old_head.c.l_edge = new_edge
+
+                    # walk up the tree from old_head and assign new l_edge to ancestors
+                    # until an ancestor already has an l_edge that's further left
+                    for anc in old_head.ancestors:
+                        if anc.c.l_edge <= new_edge:
+                            break
+                        anc.c.l_edge = new_edge
+
+            elif self.c.head < 0: # right dependent
+                old_head.c.r_kids -= 1
+                # do the same thing as for l_edge
+                if self.c.r_edge == old_head.c.r_edge:
+                    new_edge = self.c.r_edge
+
+                    if not is_desc:
+                        new_edge = old_head.i
+                        for child in old_head.children:
+                            if child == self:
+                                continue
+                            if child.c.r_edge > new_edge:
+                                new_edge = child.c.r_edge
+                        old_head.c.r_edge = new_edge
+
+                    for anc in old_head.ancestors:
+                        if anc.c.r_edge >= new_edge:
+                            break
+                        anc.c.r_edge = new_edge
+
+            # update number of deps of new head
+            if rel_newhead_i > 0: # left dependent
+                new_head.c.l_kids += 1
+                # walk up the tree from new head and set l_edge to self.l_edge
+                # until you hit a token with an l_edge further to the left
+                if self.c.l_edge < new_head.c.l_edge:
+                    new_head.c.l_edge = self.c.l_edge
+                    for anc in new_head.ancestors:
+                        if anc.c.l_edge <= self.c.l_edge:
+                            break
+                        anc.c.l_edge = self.c.l_edge
+
+            elif rel_newhead_i < 0: # right dependent
+                new_head.c.r_kids += 1
+                # do the same as for l_edge
+                if self.c.r_edge > new_head.c.r_edge:
+                    new_head.c.r_edge = self.c.r_edge
+                    for anc in new_head.ancestors:
+                        if anc.c.r_edge >= self.c.r_edge:
+                            break
+                        anc.c.r_edge = self.c.r_edge
+
+            # set new head
+            self.c.head = rel_newhead_i

     property conjuncts:
         def __get__(self):
@@ -325,6 +430,8 @@ cdef class Token:
     property dep_:
         def __get__(self):
            return self.vocab.strings[self.c.dep]
+        def __set__(self, unicode label):
+            self.c.dep = self.vocab.strings[label]

     property is_oov:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
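
The new Token.ancestors property added above walks the chain of relative head offsets with pointer arithmetic until it reaches the root. The same traversal over a plain list of offsets looks like this, as a sketch only: the heads list is an assumed encoding of the test sentence's parse (0 marks the root), chosen to be consistent with the expectations in test_ancestors.

    def ancestors(heads, i):
        # heads[i] is the offset from token i to its head; 0 marks the root
        steps = 0
        while heads[i] != 0 and steps < len(heads):   # same guard as in the Cython code
            i += heads[i]
            yield i
            steps += 1

    # 'Yesterday I saw a dog that barked loudly .' with 'saw' (index 2) as root
    heads = [2, 1, 0, 1, -2, 1, -2, -1, -6]
    assert list(ancestors(heads, 6)) == [4, 2]   # barked -> dog -> saw, as in test_ancestors
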