Fix dependency copy for as_doc (#3969)

* failing unit test for issue 3962 * attempt to fix Issue #3962 * create artificial unit test example * using length instead of self.length * sp * reformat with black * find better ancestor within span and use generic 'dep' * attach to span.root if there is no appropriate ancestor * comment span text * clean up ancestor code * reconstruct dep tree to keep same number of sentences
2026-02-01 21:16:05 +03:00 · 2019-07-23 18:28:55 +02:00 · 2019-07-23 18:28:55 +02:00 · ba02957c80
commit ba02957c80
parent a32b033b8c
3 changed files with 157 additions and 5 deletions
--- a/spacy/tests/regression/test_issue3962.py
+++ b/spacy/tests/regression/test_issue3962.py
@ -0,0 +1,112 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+from ..util import get_doc
+
+
+@pytest.fixture
+def doc(en_tokenizer):
+    text = "He jests at scars, that never felt a wound."
+    heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
+    deps = [
+        "nsubj",
+        "ccomp",
+        "prep",
+        "pobj",
+        "punct",
+        "nsubj",
+        "neg",
+        "ROOT",
+        "det",
+        "dobj",
+        "punct",
+    ]
+    tokens = en_tokenizer(text)
+    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+
+
+def test_issue3962(doc):
+    """ Ensure that as_doc does not result in out-of-bound access of tokens.
+    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
+    span2 = doc[1:5]  # "jests at scars ,"
+    doc2 = span2.as_doc()
+    doc2_json = doc2.to_json()
+    assert doc2_json
+
+    assert doc2[0].head.text == "jests"  # head set to itself, being the new artificial root
+    assert doc2[0].dep_ == "dep"
+    assert doc2[1].head.text == "jests"
+    assert doc2[1].dep_ == "prep"
+    assert doc2[2].head.text == "at"
+    assert doc2[2].dep_ == "pobj"
+    assert doc2[3].head.text == "jests"  # head set to the new artificial root
+    assert doc2[3].dep_ == "dep"
+
+    # We should still have 1 sentence
+    assert len(list(doc2.sents)) == 1
+
+    span3 = doc[6:9]  # "never felt a"
+    doc3 = span3.as_doc()
+    doc3_json = doc3.to_json()
+    assert doc3_json
+
+    assert doc3[0].head.text == "felt"
+    assert doc3[0].dep_ == "neg"
+    assert doc3[1].head.text == "felt"
+    assert doc3[1].dep_ == "ROOT"
+    assert doc3[2].head.text == "felt"  # head set to ancestor
+    assert doc3[2].dep_ == "dep"
+
+    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
+    assert len(list(doc3.sents)) == 1
+
+
+@pytest.fixture
+def two_sent_doc(en_tokenizer):
+    text = "He jests at scars. They never felt a wound."
+    heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
+    deps = [
+        "nsubj",
+        "ROOT",
+        "prep",
+        "pobj",
+        "punct",
+        "nsubj",
+        "neg",
+        "ROOT",
+        "det",
+        "dobj",
+        "punct",
+    ]
+    tokens = en_tokenizer(text)
+    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+
+
+def test_issue3962_long(two_sent_doc):
+    """ Ensure that as_doc does not result in out-of-bound access of tokens.
+    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
+    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
+    doc2 = span2.as_doc()
+    doc2_json = doc2.to_json()
+    assert doc2_json
+
+    assert doc2[0].head.text == "jests"  # head set to itself, being the new artificial root (in sentence 1)
+    assert doc2[0].dep_ == "ROOT"
+    assert doc2[1].head.text == "jests"
+    assert doc2[1].dep_ == "prep"
+    assert doc2[2].head.text == "at"
+    assert doc2[2].dep_ == "pobj"
+    assert doc2[3].head.text == "jests"
+    assert doc2[3].dep_ == "punct"
+    assert doc2[4].head.text == "They"  # head set to itself, being the new artificial root (in sentence 2)
+    assert doc2[4].dep_ == "dep"
+    assert doc2[4].head.text == "They"  # head set to the new artificial head (in sentence 2)
+    assert doc2[4].dep_ == "dep"
+
+    # We should still have 2 sentences
+    sents = list(doc2.sents)
+    assert len(sents) == 2
+    assert sents[0].text == "jests at scars ."
+    assert sents[1].text == "They never"
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -794,7 +794,7 @@ cdef class Doc:
                if array[i, col] != 0:
                    self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
        # Now load the data
-        for i in range(self.length):
+        for i in range(length):
            token = &self.c[i]
            for j in range(n_attrs):
                if attr_ids[j] != TAG:
@ -804,7 +804,7 @@ cdef class Doc:
        self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
        # If document is parsed, set children
        if self.is_parsed:
-            set_children_from_heads(self.c, self.length)
+            set_children_from_heads(self.c, length)
        return self

    def get_lca_matrix(self):
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -17,6 +17,7 @@ from ..attrs cimport attr_id_t
 from ..parts_of_speech cimport univ_pos_t
 from ..attrs cimport *
 from ..lexeme cimport Lexeme
+from ..symbols cimport dep

 from ..util import normalize_slice
 from ..compat import is_config, basestring_
@ -206,7 +207,6 @@ cdef class Span:

        DOCS: https://spacy.io/api/span#as_doc
        """
-        # TODO: Fix!
        words = [t.text for t in self]
        spaces = [bool(t.whitespace_) for t in self]
        cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
@ -220,7 +220,9 @@ cdef class Span:
        else:
            array_head.append(SENT_START)
        array = self.doc.to_array(array_head)
-        doc.from_array(array_head, array[self.start : self.end])
+        array = array[self.start : self.end]
+        self._fix_dep_copy(array_head, array)
+        doc.from_array(array_head, array)
        doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
        doc.user_hooks = self.doc.user_hooks
        doc.user_span_hooks = self.doc.user_span_hooks
@ -235,6 +237,44 @@ cdef class Span:
                    doc.cats[cat_label] = value
        return doc

+    def _fix_dep_copy(self, attrs, array):
+        """ Rewire dependency links to make sure their heads fall into the span
+        while still keeping the correct number of sentences. """
+        cdef int length = len(array)
+        cdef attr_t value
+        cdef int i, head_col, ancestor_i
+        old_to_new_root = dict()
+        if HEAD in attrs:
+            head_col = attrs.index(HEAD)
+            for i in range(length):
+                # if the HEAD refers to a token outside this span, find a more appropriate ancestor
+                token = self[i]
+                ancestor_i = token.head.i - self.start   # span offset
+                if ancestor_i not in range(length):
+                    if DEP in attrs:
+                        array[i, attrs.index(DEP)] = dep
+
+                    # try finding an ancestor within this span
+                    ancestors = token.ancestors
+                    for ancestor in ancestors:
+                        ancestor_i = ancestor.i - self.start
+                        if ancestor_i in range(length):
+                            array[i, head_col] = ancestor_i - i
+
+                # if there is no appropriate ancestor, define a new artificial root
+                value = array[i, head_col]
+                if (i+value) not in range(length):
+                    new_root = old_to_new_root.get(ancestor_i, None)
+                    if new_root is not None:
+                        # take the same artificial root as a previous token from the same sentence
+                        array[i, head_col] = new_root - i
+                    else:
+                        # set this token as the new artificial root
+                        array[i, head_col] = 0
+                        old_to_new_root[ancestor_i] = i
+
+        return array
+
    def merge(self, *args, **attributes):
        """Retokenize the document, such that the span is merged into a single
        token.
@ -500,7 +540,7 @@ cdef class Span:
        if "root" in self.doc.user_span_hooks:
            return self.doc.user_span_hooks["root"](self)
        # This should probably be called 'head', and the other one called
-        # 'gov'. But we went with 'head' elsehwhere, and now we're stuck =/
+        # 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
        cdef int i
        # First, we scan through the Span, and check whether there's a word
        # with head==0, i.e. a sentence root. If so, we can return it. The