diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 976fe7910..56f38766a 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -137,7 +137,7 @@ def _print_docs_to_stdout(docs, output_type):
     if output_type == "json":
         srsly.write_json("-", docs_to_json(docs))
     else:
-        sys.stdout.buffer.write(DocBin(docs=docs).to_bytes())
+        sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())
 
 
 def _write_docs_to_file(docs, output_file, output_type):
@@ -146,7 +146,7 @@ def _write_docs_to_file(docs, output_file, output_type):
     if output_type == "json":
         srsly.write_json(output_file, docs_to_json(docs))
     else:
-        data = DocBin(docs=docs).to_bytes()
+        data = DocBin(docs=docs, store_user_data=True).to_bytes()
         with output_file.open("wb") as file_:
             file_.write(data)
diff --git a/spacy/gold/converters/json2docs.py b/spacy/gold/converters/json2docs.py
index 50ad16faf..342f94848 100644
--- a/spacy/gold/converters/json2docs.py
+++ b/spacy/gold/converters/json2docs.py
@@ -17,8 +17,6 @@ def json2docs(input_data, model=None, **kwargs):
         for json_para in json_to_annotations(json_doc):
             example_dict = _fix_legacy_dict_data(json_para)
             tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-            if json_para.get("raw"):
-                assert tok_dict.get("SPACY")
             doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
             docs.append(doc)
     return docs
diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py
index 9a688987c..64f38d21c 100644
--- a/spacy/gold/corpus.py
+++ b/spacy/gold/corpus.py
@@ -43,25 +43,36 @@ class Corpus:
             locs.append(path)
         return locs
 
+    def _make_example(self, nlp, reference, gold_preproc):
+        if gold_preproc or reference.has_unknown_spaces:
+            return Example(
+                Doc(
+                    nlp.vocab,
+                    words=[word.text for word in reference],
+                    spaces=[bool(word.whitespace_) for word in reference]
+                ),
+                reference
+            )
+        else:
+            return Example(
+                nlp.make_doc(reference.text),
+                reference
+            )
+
     def make_examples(self, nlp, reference_docs, max_length=0):
         for reference in reference_docs:
             if len(reference) == 0:
                 continue
             elif max_length == 0 or len(reference) < max_length:
-                yield Example(
-                    nlp.make_doc(reference.text),
-                    reference
-                )
+                yield self._make_example(nlp, reference, False)
             elif reference.is_sentenced:
                 for ref_sent in reference.sents:
                     if len(ref_sent) == 0:
                         continue
                     elif max_length == 0 or len(ref_sent) < max_length:
-                        yield Example(
-                            nlp.make_doc(ref_sent.text),
-                            ref_sent.as_doc()
-                        )
+                        yield self._make_example(nlp, ref_sent.as_doc(), False)
+
     def make_examples_gold_preproc(self, nlp, reference_docs):
         for reference in reference_docs:
             if reference.is_sentenced:
@@ -69,14 +80,7 @@ class Corpus:
             else:
                 ref_sents = [reference]
             for ref_sent in ref_sents:
-                eg = Example(
-                    Doc(
-                        nlp.vocab,
-                        words=[w.text for w in ref_sent],
-                        spaces=[bool(w.whitespace_) for w in ref_sent]
-                    ),
-                    ref_sent
-                )
+                eg = self._make_example(nlp, ref_sent, True)
                 if len(eg.x):
                     yield eg
diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index 2ecee1821..7b629dcd2 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -15,7 +15,7 @@ from ..syntax import nonproj
 
 
 cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
-    """ Create a Doc from dictionaries with token and doc annotations. Assumes ORTH & SPACY are set. """
+    """ Create a Doc from dictionaries with token and doc annotations. """
     attrs, array = _annot2array(vocab, tok_annot, doc_annot)
     output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
     if "entities" in doc_annot:
@@ -406,7 +406,7 @@ def _parse_links(vocab, words, spaces, links):
 
 def _guess_spaces(text, words):
     if text is None:
-        return [True] * len(words)
+        return None
     spaces = []
     text_pos = 0
     # align words with text
diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py
index 615bb1cd9..85c21f7f9 100644
--- a/spacy/tests/serialize/test_serialize_doc.py
+++ b/spacy/tests/serialize/test_serialize_doc.py
@@ -75,3 +75,19 @@ def test_serialize_doc_bin():
     for i, doc in enumerate(reloaded_docs):
         assert doc.text == texts[i]
         assert doc.cats == cats
+
+
+def test_serialize_doc_bin_unknown_spaces(en_vocab):
+    doc1 = Doc(en_vocab, words=["that", "'s"])
+    assert doc1.has_unknown_spaces
+    assert doc1.text == "that 's "
+    doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False])
+    assert not doc2.has_unknown_spaces
+    assert doc2.text == "that's"
+
+    doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes())
+    re_doc1, re_doc2 = doc_bin.get_docs(en_vocab)
+    assert re_doc1.has_unknown_spaces
+    assert re_doc1.text == "that 's "
+    assert not re_doc2.has_unknown_spaces
+    assert re_doc2.text == "that's"
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index a56900988..edc183e0d 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -61,6 +61,7 @@ class DocBin(object):
         self.spaces = []
         self.cats = []
         self.user_data = []
+        self.flags = []
         self.strings = set()
         self.store_user_data = store_user_data
         for doc in docs:
@@ -85,6 +86,9 @@ class DocBin(object):
         assert array.shape[0] == spaces.shape[0]  # this should never happen
         spaces = spaces.reshape((spaces.shape[0], 1))
         self.spaces.append(numpy.asarray(spaces, dtype=bool))
+        self.flags.append({
+            "has_unknown_spaces": doc.has_unknown_spaces
+        })
         for token in doc:
             self.strings.add(token.text)
             self.strings.add(token.tag_)
@@ -109,8 +113,11 @@ class DocBin(object):
             vocab[string]
         orth_col = self.attrs.index(ORTH)
         for i in range(len(self.tokens)):
+            flags = self.flags[i]
             tokens = self.tokens[i]
             spaces = self.spaces[i]
+            if flags.get("has_unknown_spaces"):
+                spaces = None
             doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
             doc = doc.from_array(self.attrs, tokens)
             doc.cats = self.cats[i]
@@ -134,6 +141,7 @@ class DocBin(object):
         self.spaces.extend(other.spaces)
         self.strings.update(other.strings)
         self.cats.extend(other.cats)
+        self.flags.extend(other.flags)
         if self.store_user_data:
             self.user_data.extend(other.user_data)
@@ -158,6 +166,7 @@ class DocBin(object):
             "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
             "strings": list(self.strings),
             "cats": self.cats,
+            "flags": self.flags,
         }
         if self.store_user_data:
             msg["user_data"] = self.user_data
@@ -183,6 +192,7 @@ class DocBin(object):
         self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
         self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
         self.cats = msg["cats"]
+        self.flags = msg.get("flags", [{} for _ in lengths])
         if self.store_user_data and "user_data" in msg:
             self.user_data = list(msg["user_data"])
         for tokens in self.tokens:
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 42918ab6d..2775aa97e 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -59,11 +59,14 @@ cdef class Doc:
     cdef public dict user_token_hooks
     cdef public dict user_span_hooks
 
+    cdef public bint has_unknown_spaces
+
     cdef public list _py_tokens
 
     cdef int length
     cdef int max_length
+
     cdef public object noun_chunks_iterator
 
     cdef object __weakref__
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index f69a6811d..723873e1f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -172,8 +172,7 @@ cdef class Doc:
             raise ValueError(Errors.E046.format(name=name))
         return Underscore.doc_extensions.pop(name)
 
-    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None,
-                 orths_and_spaces=None):
+    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None):
         """Create a Doc object.
 
         vocab (Vocab): A vocabulary object, which must match any models you
@@ -215,28 +214,25 @@ cdef class Doc:
         self._vector = None
         self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
         cdef bint has_space
-        if orths_and_spaces is None and words is not None:
-            if spaces is None:
-                spaces = [True] * len(words)
-            elif len(spaces) != len(words):
-                raise ValueError(Errors.E027)
-            orths_and_spaces = zip(words, spaces)
+        if words is None and spaces is not None:
+            raise ValueError("words must be set if spaces is set")
+        elif spaces is None and words is not None:
+            self.has_unknown_spaces = True
+        else:
+            self.has_unknown_spaces = False
+        words = words if words is not None else []
+        spaces = spaces if spaces is not None else ([True] * len(words))
+        if len(spaces) != len(words):
+            raise ValueError(Errors.E027)
         cdef const LexemeC* lexeme
-        if orths_and_spaces is not None:
-            orths_and_spaces = list(orths_and_spaces)
-            for orth_space in orths_and_spaces:
-                if isinstance(orth_space, unicode):
-                    lexeme = self.vocab.get(self.mem, orth_space)
-                    has_space = True
-                elif isinstance(orth_space, bytes):
-                    raise ValueError(Errors.E028.format(value=orth_space))
-                elif isinstance(orth_space[0], unicode):
-                    lexeme = self.vocab.get(self.mem, orth_space[0])
-                    has_space = orth_space[1]
-                else:
-                    lexeme = self.vocab.get_by_orth(self.mem, orth_space[0])
-                    has_space = orth_space[1]
-                self.push_back(lexeme, has_space)
+        for word, has_space in zip(words, spaces):
+            if isinstance(word, unicode):
+                lexeme = self.vocab.get(self.mem, word)
+            elif isinstance(word, bytes):
+                raise ValueError(Errors.E028.format(value=word))
+            else:
+                lexeme = self.vocab.get_by_orth(self.mem, word)
+            self.push_back(lexeme, has_space)
         # Tough to decide on policy for this. Is an empty doc tagged and parsed?
         # There's no information we'd like to add to it, so I guess so?
         if self.length == 0:
@@ -1082,6 +1078,7 @@ cdef class Doc:
             "sentiment": lambda: self.sentiment,
             "tensor": lambda: self.tensor,
             "cats": lambda: self.cats,
+            "has_unknown_spaces": lambda: self.has_unknown_spaces
         }
         for key in kwargs:
             if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
@@ -1114,6 +1111,7 @@ cdef class Doc:
             "cats": lambda b: None,
             "user_data_keys": lambda b: None,
             "user_data_values": lambda b: None,
+            "has_unknown_spaces": lambda b: None
         }
         for key in kwargs:
             if key in deserializers or key in ("user_data",):
@@ -1134,6 +1132,8 @@ cdef class Doc:
             self.tensor = msg["tensor"]
         if "cats" not in exclude and "cats" in msg:
             self.cats = msg["cats"]
+        if "has_unknown_spaces" not in exclude and "has_unknown_spaces" in msg:
+            self.has_unknown_spaces = msg["has_unknown_spaces"]
         start = 0
         cdef const LexemeC* lex
         cdef unicode orth_
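
For reviewers, a minimal sketch (not part of the patch) of the round-trip behavior the new `has_unknown_spaces` flag enables. It mirrors `test_serialize_doc_bin_unknown_spaces` above, using a bare `Vocab()` instead of the `en_vocab` fixture, and assumes a spaCy build with this patch applied:

```python
# Sketch only: unknown token spacing now survives a DocBin round-trip.
from spacy.vocab import Vocab
from spacy.tokens import Doc, DocBin

vocab = Vocab()

# Without `spaces`, spacing is unknown: every token renders with a
# trailing space and the new per-doc flag is set.
doc_unknown = Doc(vocab, words=["that", "'s"])
assert doc_unknown.has_unknown_spaces
assert doc_unknown.text == "that 's "

# With explicit `spaces`, spacing is known and preserved exactly.
doc_known = Doc(vocab, words=["that", "'s"], spaces=[False, False])
assert not doc_known.has_unknown_spaces
assert doc_known.text == "that's"

# The flag travels through DocBin's new "flags" list, so deserialization
# reproduces both texts instead of forcing every space to True.
doc_bin = DocBin().from_bytes(DocBin(docs=[doc_unknown, doc_known]).to_bytes())
reloaded = list(doc_bin.get_docs(vocab))
assert [d.text for d in reloaded] == ["that 's ", "that's"]
assert [d.has_unknown_spaces for d in reloaded] == [True, False]
```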