mirror of https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00

Fix dependency copy for as_doc (#3969)

* failing unit test for issue 3962
* attempt to fix Issue #3962
* create artificial unit test example
* using length instead of self.length
* sp
* reformat with black
* find better ancestor within span and use generic 'dep'
* attach to span.root if there is no appropriate ancestor
* comment span text
* clean up ancestor code
* reconstruct dep tree to keep same number of sentences
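
For context, issue #3962 is triggered by calling Span.as_doc() on a span whose tokens have heads outside the span. A minimal reproduction sketch follows; the model name en_core_web_sm and the exact parse it produces are assumptions for illustration, not part of this commit:

# Hypothetical reproduction sketch for issue #3962; assumes the
# en_core_web_sm model is installed and parses the sentence as in the tests.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("He jests at scars, that never felt a wound.")
span = doc[1:5]  # "jests at scars," -- some heads point outside this slice
new_doc = span.as_doc()
# Before this fix, head offsets were copied verbatim, so a head could point
# past the last token of the new doc. With the fix, such tokens re-attach to
# an in-span ancestor or become an artificial root with the generic "dep" label.
for token in new_doc:
    print(token.text, token.dep_, token.head.text)
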
This commit is contained in:
parent a32b033b8c
commit ba02957c80

spacy/tests/regression/test_issue3962.py | 112 (new file)

@@ -0,0 +1,112 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+from ..util import get_doc
+
+
+@pytest.fixture
+def doc(en_tokenizer):
+    text = "He jests at scars, that never felt a wound."
+    heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
+    deps = [
+        "nsubj",
+        "ccomp",
+        "prep",
+        "pobj",
+        "punct",
+        "nsubj",
+        "neg",
+        "ROOT",
+        "det",
+        "dobj",
+        "punct",
+    ]
+    tokens = en_tokenizer(text)
+    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+
+
+def test_issue3962(doc):
+    """ Ensure that as_doc does not result in out-of-bound access of tokens.
+    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
+    span2 = doc[1:5]  # "jests at scars ,"
+    doc2 = span2.as_doc()
+    doc2_json = doc2.to_json()
+    assert doc2_json
+
+    assert doc2[0].head.text == "jests"  # head set to itself, being the new artificial root
+    assert doc2[0].dep_ == "dep"
+    assert doc2[1].head.text == "jests"
+    assert doc2[1].dep_ == "prep"
+    assert doc2[2].head.text == "at"
+    assert doc2[2].dep_ == "pobj"
+    assert doc2[3].head.text == "jests"  # head set to the new artificial root
+    assert doc2[3].dep_ == "dep"
+
+    # We should still have 1 sentence
+    assert len(list(doc2.sents)) == 1
+
+    span3 = doc[6:9]  # "never felt a"
+    doc3 = span3.as_doc()
+    doc3_json = doc3.to_json()
+    assert doc3_json
+
+    assert doc3[0].head.text == "felt"
+    assert doc3[0].dep_ == "neg"
+    assert doc3[1].head.text == "felt"
+    assert doc3[1].dep_ == "ROOT"
+    assert doc3[2].head.text == "felt"  # head set to ancestor
+    assert doc3[2].dep_ == "dep"
+
+    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
+    assert len(list(doc3.sents)) == 1
+
+
+@pytest.fixture
+def two_sent_doc(en_tokenizer):
+    text = "He jests at scars. They never felt a wound."
+    heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
+    deps = [
+        "nsubj",
+        "ROOT",
+        "prep",
+        "pobj",
+        "punct",
+        "nsubj",
+        "neg",
+        "ROOT",
+        "det",
+        "dobj",
+        "punct",
+    ]
+    tokens = en_tokenizer(text)
+    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+
+
+def test_issue3962_long(two_sent_doc):
+    """ Ensure that as_doc does not result in out-of-bound access of tokens.
+    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
+    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
+    doc2 = span2.as_doc()
+    doc2_json = doc2.to_json()
+    assert doc2_json
+
+    assert doc2[0].head.text == "jests"  # head set to itself, being the new artificial root (in sentence 1)
+    assert doc2[0].dep_ == "ROOT"
+    assert doc2[1].head.text == "jests"
+    assert doc2[1].dep_ == "prep"
+    assert doc2[2].head.text == "at"
+    assert doc2[2].dep_ == "pobj"
+    assert doc2[3].head.text == "jests"
+    assert doc2[3].dep_ == "punct"
+    assert doc2[4].head.text == "They"  # head set to itself, being the new artificial root (in sentence 2)
+    assert doc2[4].dep_ == "dep"
+    assert doc2[5].head.text == "They"  # head set to the new artificial head (in sentence 2)
+    assert doc2[5].dep_ == "dep"
+
+    # We should still have 2 sentences
+    sents = list(doc2.sents)
+    assert len(sents) == 2
+    assert sents[0].text == "jests at scars ."
+    assert sents[1].text == "They never"
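
A note on the test helper: get_doc (from the test suite's util module) takes heads as offsets relative to each token rather than absolute indices, with an offset of 0 marking a sentence root. A plain-Python illustration, reusing the values from the first fixture (the loop itself is not spaCy API):

# How the relative `heads` offsets in the fixtures map to absolute heads.
words = ["He", "jests", "at", "scars", ",", "that",
         "never", "felt", "a", "wound", "."]
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
for i, (word, offset) in enumerate(zip(words, heads)):
    print(word, "->", words[i + offset])  # 0 means the token is its own head
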
spacy/tokens/doc.pyx

@@ -794,7 +794,7 @@ cdef class Doc:
             if array[i, col] != 0:
                 self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
         # Now load the data
-        for i in range(self.length):
+        for i in range(length):
             token = &self.c[i]
             for j in range(n_attrs):
                 if attr_ids[j] != TAG:
@@ -804,7 +804,7 @@ cdef class Doc:
         self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
         # If document is parsed, set children
         if self.is_parsed:
             set_children_from_heads(self.c, self.length)
-            set_children_from_heads(self.c, self.length)
+            set_children_from_heads(self.c, length)
         return self

     def get_lca_matrix(self):
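
These two hunks implement the "using length instead of self.length" bullet: presumably length = len(array) is defined earlier in from_array (outside this hunk), so the copy loop and set_children_from_heads are bounded by the array's row count rather than the doc's token count. A toy illustration of the bug class, in plain Python/NumPy rather than the Cython internals:

# Bounding a copy loop by the destination's token count instead of the
# source array's row count overruns the array when it is a span-sized slice.
import numpy as np

sliced = np.arange(22).reshape(11, 2)[1:5]  # attribute rows for a 4-token span
doc_length = 11                             # the original doc had 11 tokens

for i in range(len(sliced)):                # safe: bounded by the array itself
    row = sliced[i]

# for i in range(doc_length):               # unsafe: IndexError once i == 4
#     row = sliced[i]
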
spacy/tokens/span.pyx

@@ -17,6 +17,7 @@ from ..attrs cimport attr_id_t
 from ..parts_of_speech cimport univ_pos_t
 from ..attrs cimport *
 from ..lexeme cimport Lexeme
+from ..symbols cimport dep

 from ..util import normalize_slice
 from ..compat import is_config, basestring_
@@ -206,7 +207,6 @@ cdef class Span:

         DOCS: https://spacy.io/api/span#as_doc
         """
-        # TODO: Fix!
         words = [t.text for t in self]
         spaces = [bool(t.whitespace_) for t in self]
         cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
@@ -220,7 +220,9 @@ cdef class Span:
         else:
             array_head.append(SENT_START)
         array = self.doc.to_array(array_head)
-        doc.from_array(array_head, array[self.start : self.end])
+        array = array[self.start : self.end]
+        self._fix_dep_copy(array_head, array)
+        doc.from_array(array_head, array)
         doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
         doc.user_hooks = self.doc.user_hooks
         doc.user_span_hooks = self.doc.user_span_hooks
@@ -235,6 +237,44 @@ cdef class Span:
             doc.cats[cat_label] = value
         return doc

+    def _fix_dep_copy(self, attrs, array):
+        """ Rewire dependency links to make sure their heads fall into the span
+        while still keeping the correct number of sentences. """
+        cdef int length = len(array)
+        cdef attr_t value
+        cdef int i, head_col, ancestor_i
+        old_to_new_root = dict()
+        if HEAD in attrs:
+            head_col = attrs.index(HEAD)
+            for i in range(length):
+                # if the HEAD refers to a token outside this span, find a more appropriate ancestor
+                token = self[i]
+                ancestor_i = token.head.i - self.start  # span offset
+                if ancestor_i not in range(length):
+                    if DEP in attrs:
+                        array[i, attrs.index(DEP)] = dep
+
+                    # try finding an ancestor within this span
+                    ancestors = token.ancestors
+                    for ancestor in ancestors:
+                        ancestor_i = ancestor.i - self.start
+                        if ancestor_i in range(length):
+                            array[i, head_col] = ancestor_i - i
+
+                # if there is no appropriate ancestor, define a new artificial root
+                value = array[i, head_col]
+                if (i+value) not in range(length):
+                    new_root = old_to_new_root.get(ancestor_i, None)
+                    if new_root is not None:
+                        # take the same artificial root as a previous token from the same sentence
+                        array[i, head_col] = new_root - i
+                    else:
+                        # set this token as the new artificial root
+                        array[i, head_col] = 0
+                        old_to_new_root[ancestor_i] = i
+        return array
+
     def merge(self, *args, **attributes):
         """Retokenize the document, such that the span is merged into a single
         token.
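
To make the rewiring concrete, here is a standalone plain-Python sketch of the artificial-root fallback above (fix_heads is a hypothetical name, not spaCy API, and the in-span ancestor search over token.ancestors is omitted for brevity). It works on relative head offsets, as the HEAD column does, and keeps the old_to_new_root bookkeeping so that tokens whose heads leave the span through the same original ancestor share one artificial root, which is what preserves the sentence count:

# Hypothetical sketch of _fix_dep_copy's root-promotion step.
def fix_heads(rel_heads, start, end):
    length = end - start
    fixed = list(rel_heads[start:end])
    old_to_new_root = {}  # old absolute head index -> new in-span root
    for i in range(length):
        if i + fixed[i] not in range(length):  # head falls outside the span
            old_head = start + i + fixed[i]
            # reuse this sentence's artificial root if one was already chosen,
            # otherwise promote the current token (offset 0 marks a root)
            root = old_to_new_root.setdefault(old_head, i)
            fixed[i] = root - i
    return fixed

# Offsets for "He jests at scars, that never felt a wound." (first fixture);
# slicing tokens 1..5 mirrors doc2 in test_issue3962:
print(fix_heads([1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3], 1, 5))
# -> [0, -1, -1, -3]: "jests" becomes the root and "," re-attaches to it
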
@@ -500,7 +540,7 @@ cdef class Span:
         if "root" in self.doc.user_span_hooks:
             return self.doc.user_span_hooks["root"](self)
         # This should probably be called 'head', and the other one called
-        # 'gov'. But we went with 'head' elsehwhere, and now we're stuck =/
+        # 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
         cdef int i
         # First, we scan through the Span, and check whether there's a word
         # with head==0, i.e. a sentence root. If so, we can return it. The