From 63b7accd746c608839a9e9dbaab6c2b35597e821 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sun, 30 Dec 2018 15:17:46 +0100
Subject: [PATCH] =?UTF-8?q?=F0=9F=92=AB=20Make=20span.as=5Fdoc()=20return?=
 =?UTF-8?q?=20a=20copy,=20not=20a=20view.=20Closes=20#1537=20(#3107)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Initially, span.as_doc() was designed to return a view of the span's contents as a Doc object. This was a nice idea, but it fails because of the token.idx property, which stores each token's character offset within the document's text. In a span, the idx of the first token might not be 0, whereas in a standalone Doc it must be. Because the two would need different idx values for the same underlying token data, a view cannot work --- it would be inconsistent.

This patch changes span.as_doc() to instead return a copy. The docs are updated accordingly. Closes #1537

* Update test for span.as_doc()

* Make span.as_doc() return a copy. Closes #1537

* Document change to Span.as_doc()
---
 spacy/tests/doc/test_span.py |  3 +++
 spacy/tokens/span.pyx        | 29 +++++++++++++++++++----------
 website/api/span.jade        |  4 ++--
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 11fc0f228..495ed47be 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -152,6 +152,9 @@ def test_span_as_doc(doc):
     span = doc[4:10]
     span_doc = span.as_doc()
     assert span.text == span_doc.text.strip()
+    assert isinstance(span_doc, doc.__class__)
+    assert span_doc is not doc
+    assert span_doc[0].idx == 0
 
 
 def test_span_string_label(doc):
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 44ce04f34..440cf1859 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -14,7 +14,7 @@ from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t
 from ..parts_of_speech cimport univ_pos_t
 from ..util import normalize_slice
-from ..attrs cimport IS_PUNCT, IS_SPACE
+from ..attrs cimport *
 from ..lexeme cimport Lexeme
 from ..compat import is_config, basestring_
 from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning
@@ -149,23 +149,32 @@ cdef class Span:
 
     def as_doc(self):
         # TODO: fix
-        """Create a `Doc` object view of the Span's data. This is mostly
-        useful for C-typed interfaces.
+        """Create a `Doc` object with a copy of the Span's data.
 
-        RETURNS (Doc): The `Doc` view of the span.
+        RETURNS (Doc): The `Doc` copy of the span.
         """
-        cdef Doc doc = Doc(self.doc.vocab)
-        doc.length = self.end-self.start
-        doc.c = &self.doc.c[self.start]
-        doc.mem = self.doc.mem
-        doc.is_parsed = self.doc.is_parsed
-        doc.is_tagged = self.doc.is_tagged
+        cdef Doc doc = Doc(self.doc.vocab,
+            words=[t.text for t in self],
+            spaces=[bool(t.whitespace_) for t in self])
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]
+        if self.doc.is_tagged:
+            array_head.append(TAG)
+        # if doc parsed add head and dep attribute
+        if self.doc.is_parsed:
+            array_head.extend([HEAD, DEP])
+        # otherwise add sent_start
+        else:
+            array_head.append(SENT_START)
+        array = self.doc.to_array(array_head)
+        doc.from_array(array_head, array[self.start : self.end])
+ 
         doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
         doc.user_hooks = self.doc.user_hooks
         doc.user_span_hooks = self.doc.user_span_hooks
         doc.user_token_hooks = self.doc.user_token_hooks
         doc.vector = self.vector
         doc.vector_norm = self.vector_norm
+        doc.tensor = self.doc.tensor[self.start : self.end]
         for key, value in self.doc.cats.items():
             if hasattr(key, '__len__') and len(key) == 3:
                 cat_start, cat_end, cat_label = key
diff --git a/website/api/span.jade b/website/api/span.jade
index e13fa29a5..7b098123d 100644
--- a/website/api/span.jade
+++ b/website/api/span.jade
@@ -377,8 +377,8 @@ p
 +h(2, "as_doc") Span.as_doc
 
 p
-    |  Create a #[code Doc] object view of the #[code Span]'s data. Mostly
-    |  useful for C-typed interfaces.
+    |  Create a new #[code Doc] object corresponding to the #[code Span], with
+    |  a copy of the data.
 
 +aside-code("Example").
     doc = nlp(u'I like New York in Autumn.')