diff --git a/setup.py b/setup.py
index b453ec1f4..c295e05cf 100755
--- a/setup.py
+++ b/setup.py
@@ -37,6 +37,7 @@ MOD_NAMES = [
     'spacy.tokens.doc',
     'spacy.tokens.span',
     'spacy.tokens.token',
+    'spacy.tokens._retokenize',
    'spacy.matcher',
     'spacy.syntax.ner',
     'spacy.symbols',
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
new file mode 100644
index 000000000..00f724ed6
--- /dev/null
+++ b/spacy/tokens/_retokenize.pyx
@@ -0,0 +1,129 @@
+# coding: utf8
+# cython: infer_types=True
+# cython: boundscheck=False
+# cython: profile=True
+from __future__ import unicode_literals
+
+from libc.string cimport memcpy, memset
+
+from .doc cimport Doc, set_children_from_heads, token_by_start, token_by_end
+from .span cimport Span
+from .token cimport Token
+from ..lexeme cimport Lexeme, EMPTY_LEXEME
+from ..structs cimport LexemeC, TokenC
+from ..attrs cimport *
+
+
+cdef class Retokenizer:
+    '''Helper class for doc.retokenize() context manager.'''
+    cdef Doc doc
+    cdef list merges
+    cdef list splits
+    def __init__(self, doc):
+        self.doc = doc
+        self.merges = []
+        self.splits = []
+
+    def merge(self, Span span, attrs=None):
+        '''Mark a span for merging. The attrs will be applied to the resulting
+        token.'''
+        self.merges.append((span.start_char, span.end_char, attrs))
+
+    def split(self, Token token, orths, attrs=None):
+        '''Mark a Token for splitting into the specified orths. The attrs
+        will be applied to each subtoken.'''
+        self.splits.append((token.idx, orths, attrs))
+
+    def __enter__(self):
+        self.merges = []
+        self.splits = []
+        return self
+
+    def __exit__(self, *args):
+        # Do the actual merging here
+        for start_char, end_char, attrs in self.merges:
+            start = token_by_start(self.doc.c, self.doc.length, start_char)
+            end = token_by_end(self.doc.c, self.doc.length, end_char)
+            _merge(self.doc, start, end+1, attrs)
+        for start_char, orths, attrs in self.splits:
+            raise NotImplementedError
+
+
+def _merge(Doc doc, int start, int end, attributes):
+    """Retokenize the document, such that the token span `doc[start : end]`
+    is merged into a single token. The `start` and `end` arguments are
+    token indices, computed by the retokenizer from the character offsets
+    of the span that was marked for merging.
+
+    start (int): Index of the first token of the span to merge.
+    end (int): Index of the first token following the span to merge.
+    attributes (dict): Attributes to assign to the merged token. By default,
+        attributes are inherited from the syntactic root of the span.
+    RETURNS (Token): The newly merged token, located at the position of
+        the first token of the original span.
+ """ + cdef Span span = doc[start:end] + cdef int start_char = span.start_char + cdef int end_char = span.end_char + # Get LexemeC for newly merged token + new_orth = ''.join([t.text_with_ws for t in span]) + if span[-1].whitespace_: + new_orth = new_orth[:-len(span[-1].whitespace_)] + cdef const LexemeC* lex = doc.vocab.get(doc.mem, new_orth) + # House the new merged token where it starts + cdef TokenC* token = &doc.c[start] + token.spacy = doc.c[end-1].spacy + for attr_name, attr_value in attributes.items(): + if attr_name == TAG: + doc.vocab.morphology.assign_tag(token, attr_value) + else: + Token.set_struct_attr(token, attr_name, attr_value) + # Make sure ent_iob remains consistent + if doc.c[end].ent_iob == 1 and token.ent_iob in (0, 2): + if token.ent_type == doc.c[end].ent_type: + token.ent_iob = 3 + else: + # If they're not the same entity type, let them be two entities + doc.c[end].ent_iob = 3 + # Begin by setting all the head indices to absolute token positions + # This is easier to work with for now than the offsets + # Before thinking of something simpler, beware the case where a + # dependency bridges over the entity. Here the alignment of the + # tokens changes. + span_root = span.root.i + token.dep = span.root.dep + # We update token.lex after keeping span root and dep, since + # setting token.lex will change span.start and span.end properties + # as it modifies the character offsets in the doc + token.lex = lex + for i in range(doc.length): + doc.c[i].head += i + # Set the head of the merged token, and its dep relation, from the Span + token.head = doc.c[span_root].head + # Adjust deps before shrinking tokens + # Tokens which point into the merged token should now point to it + # Subtract the offset from all tokens which point to >= end + offset = (end - start) - 1 + for i in range(doc.length): + head_idx = doc.c[i].head + if start <= head_idx < end: + doc.c[i].head = start + elif head_idx >= end: + doc.c[i].head -= offset + # Now compress the token array + for i in range(end, doc.length): + doc.c[i - offset] = doc.c[i] + for i in range(doc.length - offset, doc.length): + memset(&doc.c[i], 0, sizeof(TokenC)) + doc.c[i].lex = &EMPTY_LEXEME + doc.length -= offset + for i in range(doc.length): + # ...And, set heads back to a relative position + doc.c[i].head -= i + # Set the left/right children, left/right edges + set_children_from_heads(doc.c, doc.length) + # Clear the cached Python objects + # Return the merged Python object + return doc[start] + + diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index f34c455c6..63582646f 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -25,6 +25,8 @@ cdef int token_by_start(const TokenC* tokens, int length, int start_char) except cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2 +cdef int set_children_from_heads(TokenC* tokens, int length) except -1 + cdef class Doc: cdef readonly Pool mem cdef readonly Vocab vocab diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 1885dc872..c7eac15c0 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -34,6 +34,7 @@ from ..compat import is_config, copy_reg, pickle, basestring_ from .. import about from .. 
 from .underscore import Underscore
+from ._retokenize import Retokenizer
 
 
 DEF PADDING = 5
@@ -186,6 +187,20 @@ cdef class Doc:
     def _(self):
         return Underscore(Underscore.doc_extensions, self)
 
+    @property
+    def is_sentenced(self):
+        # Check if the document has sentence boundaries set,
+        # i.e. at least one token has sent_start set to 1 or -1
+        if 'sents' in self.user_hooks:
+            return True
+        if self.is_parsed:
+            return True
+        for i in range(self.length):
+            if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
+                return True
+        else:
+            return False
+
     def __getitem__(self, object i):
         """Get a `Token` or `Span` object.
 
@@ -305,7 +320,7 @@ cdef class Doc:
                 break
         else:
             return 1.0
-
+
         if self.vector_norm == 0 or other.vector_norm == 0:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
@@ -517,29 +532,23 @@ cdef class Doc:
             >>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
         """
         def __get__(self):
+            if not self.is_sentenced:
+                raise ValueError(
+                    "Sentence boundaries unset. You can add the 'sentencizer' "
+                    "component to the pipeline with: "
+                    "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
+                    "Alternatively, add the dependency parser, or set "
+                    "sentence boundaries by setting doc[i].sent_start")
             if 'sents' in self.user_hooks:
                 yield from self.user_hooks['sents'](self)
-                return
-
-            cdef int i
-            if not self.is_parsed:
+            else:
+                start = 0
                 for i in range(1, self.length):
-                    if self.c[i].sent_start != 0:
-                        break
-                else:
-                    raise ValueError(
-                        "Sentence boundaries unset. You can add the 'sentencizer' "
-                        "component to the pipeline with: "
-                        "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
-                        "Alternatively, add the dependency parser, or set "
-                        "sentence boundaries by setting doc[i].sent_start")
-            start = 0
-            for i in range(1, self.length):
-                if self.c[i].sent_start == 1:
-                    yield Span(self, start, i)
-                    start = i
-            if start != self.length:
-                yield Span(self, start, self.length)
+                    if self.c[i].sent_start == 1:
+                        yield Span(self, start, i)
+                        start = i
+                if start != self.length:
+                    yield Span(self, start, self.length)
 
     cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
         if self.length == 0:
@@ -559,9 +568,7 @@ cdef class Doc:
             t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
         t.l_edge = self.length
         t.r_edge = self.length
-        if t.lex.orth == 0:
-            raise ValueError("Invalid token: empty string ('') at position {}"
-                             .format(self.length))
+        assert t.lex.orth != 0
         t.spacy = has_space
         self.length += 1
         return t.idx + t.lex.length + t.spacy
@@ -882,6 +889,18 @@ cdef class Doc:
         else:
             self.tensor = xp.hstack((self.tensor, tensor))
 
+    def retokenize(self):
+        '''Context manager to handle retokenization of the Doc.
+        Modifications to the Doc's tokenization are stored, and then
+        made all at once when the context manager exits. This is
+        much more efficient and less error-prone.
+
+        All views of the Doc (Span and Token) created before the
+        retokenization are invalidated, although they may accidentally
+        continue to work.
+        '''
+        return Retokenizer(self)
+
     def merge(self, int start_idx, int end_idx, *args, **attributes):
         """Retokenize the document, such that the span at
         `doc.text[start_idx : end_idx]` is merged into a single token. If
@@ -935,66 +954,8 @@ cdef class Doc:
             return None
         # Currently we have the token index, we want the range-end index
         end += 1
-        cdef Span span = self[start:end]
-        # Get LexemeC for newly merged token
-        new_orth = ''.join([t.text_with_ws for t in span])
-        if span[-1].whitespace_:
-            new_orth = new_orth[:-len(span[-1].whitespace_)]
-        cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
-        # House the new merged token where it starts
-        cdef TokenC* token = &self.c[start]
-        token.spacy = self.c[end-1].spacy
-        for attr_name, attr_value in attributes.items():
-            if attr_name == TAG:
-                self.vocab.morphology.assign_tag(token, attr_value)
-            else:
-                Token.set_struct_attr(token, attr_name, attr_value)
-        # Make sure ent_iob remains consistent
-        if self.c[end].ent_iob == 1 and token.ent_iob in (0, 2):
-            if token.ent_type == self.c[end].ent_type:
-                token.ent_iob = 3
-            else:
-                # If they're not the same entity type, let them be two entities
-                self.c[end].ent_iob = 3
-        # Begin by setting all the head indices to absolute token positions
-        # This is easier to work with for now than the offsets
-        # Before thinking of something simpler, beware the case where a
-        # dependency bridges over the entity. Here the alignment of the
-        # tokens changes.
-        span_root = span.root.i
-        token.dep = span.root.dep
-        # We update token.lex after keeping span root and dep, since
-        # setting token.lex will change span.start and span.end properties
-        # as it modifies the character offsets in the doc
-        token.lex = lex
-        for i in range(self.length):
-            self.c[i].head += i
-        # Set the head of the merged token, and its dep relation, from the Span
-        token.head = self.c[span_root].head
-        # Adjust deps before shrinking tokens
-        # Tokens which point into the merged token should now point to it
-        # Subtract the offset from all tokens which point to >= end
-        offset = (end - start) - 1
-        for i in range(self.length):
-            head_idx = self.c[i].head
-            if start <= head_idx < end:
-                self.c[i].head = start
-            elif head_idx >= end:
-                self.c[i].head -= offset
-        # Now compress the token array
-        for i in range(end, self.length):
-            self.c[i - offset] = self.c[i]
-        for i in range(self.length - offset, self.length):
-            memset(&self.c[i], 0, sizeof(TokenC))
-            self.c[i].lex = &EMPTY_LEXEME
-        self.length -= offset
-        for i in range(self.length):
-            # ...And, set heads back to a relative position
-            self.c[i].head -= i
-        # Set the left/right children, left/right edges
-        set_children_from_heads(self.c, self.length)
-        # Clear the cached Python objects
-        # Return the merged Python object
+        with self.retokenize() as retokenizer:
+            retokenizer.merge(self[start:end], attrs=attributes)
        return self[start]
 
     def print_tree(self, light=False, flat=False):
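
Usage notes (illustrative sketches, not part of the patch):

The first sketch shows how the new doc.retokenize() context manager is meant to
be used. It assumes an installed English model ('en_core_web_sm') that tokenizes
the sample sentence in the usual way. Because _merge() applies the attrs dict
as-is via Token.set_struct_attr, the sketch passes an integer attribute ID from
spacy.attrs and a hash obtained from the StringStore rather than plain strings;
only doc.retokenize() and Retokenizer.merge() are defined by this diff, everything
else is an assumption about the surrounding spaCy API of this era.

    # Hypothetical usage sketch of the API added in this patch.
    from __future__ import unicode_literals
    import spacy
    from spacy.attrs import LEMMA

    nlp = spacy.load('en_core_web_sm')
    doc = nlp('I like New York in Autumn.')

    with doc.retokenize() as retokenizer:
        # Merges are queued on the Retokenizer and applied once, when the
        # `with` block exits, so token indices stay stable inside the block.
        retokenizer.merge(doc[2:4],
                          attrs={LEMMA: doc.vocab.strings.add('New York')})

    assert doc[2].text == 'New York'
    assert len(doc) == 6

The second sketch is a pure-Python walk-through (not spaCy code) of the head
bookkeeping in _merge(): TokenC.head stores each head as an offset relative to
its token; _merge() makes the offsets absolute, gives the merged slot the span
root's head, re-points heads that fall inside the merged span at that slot,
shifts heads past the span to the left, and converts back to relative offsets
after compressing the array.

    # Tokens: "I", "really", "like", "cake"; "like" is the sentence root
    # and we merge the span "really like" (tokens 1 and 2).
    relative_heads = [2, 1, 0, -1]   # head offsets, as stored in TokenC.head
    start, end = 1, 3                # token slice of the span to merge
    span_root = 2                    # absolute index of the span root ("like")

    heads = [h + i for i, h in enumerate(relative_heads)]  # absolute: [2, 2, 2, 2]
    heads[start] = heads[span_root]  # merged token inherits the root's head
    offset = (end - start) - 1       # the token array shrinks by this much

    for i, h in enumerate(heads):
        if start <= h < end:         # heads pointing inside the merged span...
            heads[i] = start         # ...now point at the merged token
        elif h >= end:
            heads[i] = h - offset    # heads after the span shift left

    heads = heads[:start + 1] + heads[end:]          # compress: [1, 1, 1]
    relative = [h - i for i, h in enumerate(heads)]  # back to offsets
    print(relative)                  # [1, 0, -1]: "really like" is the new root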