From 5651a0d052bcfd160b187828aa3d8d90652929fe Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 15 Feb 2019 10:29:44 +0100 Subject: [PATCH 1/2] =?UTF-8?q?=F0=9F=92=AB=20Replace=20{Doc,Span}.merge?= =?UTF-8?q?=20with=20Doc.retokenize=20(#3280)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add deprecation warning to Doc.merge and Span.merge * Replace {Doc,Span}.merge with Doc.retokenize --- spacy/cli/ud/ud_run_test.py | 8 +- spacy/cli/ud/ud_train.py | 6 +- spacy/displacy/__init__.py | 20 ++-- spacy/errors.py | 3 + spacy/pipeline/functions.py | 27 +++--- spacy/tests/doc/test_doc_api.py | 92 ++++--------------- spacy/tests/doc/test_span_merge.py | 61 ++++++------ spacy/tests/lang/test_initialize.py | 2 +- spacy/tests/matcher/test_matcher_api.py | 4 +- spacy/tests/parser/test_parse.py | 6 +- spacy/tests/regression/test_issue1-1000.py | 17 ++-- spacy/tests/regression/test_issue1501-2000.py | 3 +- spacy/tokens/doc.pyx | 1 + spacy/tokens/span.pyx | 2 + 14 files changed, 100 insertions(+), 152 deletions(-) diff --git a/spacy/cli/ud/ud_run_test.py b/spacy/cli/ud/ud_run_test.py index 43140eb03..35c878721 100644 --- a/spacy/cli/ud/ud_run_test.py +++ b/spacy/cli/ud/ud_run_test.py @@ -112,10 +112,10 @@ def write_conllu(docs, file_): for i, doc in enumerate(docs): matches = merger(doc) spans = [doc[start : end + 1] for _, start, end in matches] - offsets = [(span.start_char, span.end_char) for span in spans] - for start_char, end_char in offsets: - doc.merge(start_char, end_char) - # TODO: This shuldn't be necessary? Should be handled in merge + with doc.retokenize() as retokenizer: + for span in spans: + retokenizer.merge(span) + # TODO: This shouldn't be necessary? Should be handled in merge for word in doc: if word.i == word.head.i: word.dep_ = "ROOT" diff --git a/spacy/cli/ud/ud_train.py b/spacy/cli/ud/ud_train.py index 6c4fbb3eb..563fcfb87 100644 --- a/spacy/cli/ud/ud_train.py +++ b/spacy/cli/ud/ud_train.py @@ -217,9 +217,9 @@ def write_conllu(docs, file_): for i, doc in enumerate(docs): matches = merger(doc) spans = [doc[start : end + 1] for _, start, end in matches] - offsets = [(span.start_char, span.end_char) for span in spans] - for start_char, end_char in offsets: - doc.merge(start_char, end_char) + with doc.retokenize() as retokenizer: + for span in spans: + retokenizer.merge(span) file_.write("# newdoc id = {i}\n".format(i=i)) for j, sent in enumerate(doc.sents): file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j)) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 3a3cba708..f8886848d 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -107,8 +107,14 @@ def parse_deps(orig_doc, options={}): if not doc.is_parsed: user_warning(Warnings.W005) if options.get("collapse_phrases", False): - for np in list(doc.noun_chunks): - np.merge(tag=np.root.tag_, lemma=np.root.lemma_, ent_type=np.root.ent_type_) + with doc.retokenize() as retokenizer: + for np in list(doc.noun_chunks): + attrs = { + "tag": np.root.tag_, + "lemma": np.root.lemma_, + "ent_type": np.root.ent_type_, + } + retokenizer.merge(np, attrs=attrs) if options.get("collapse_punct", True): spans = [] for word in doc[:-1]: @@ -119,11 +125,11 @@ def parse_deps(orig_doc, options={}): while end < len(doc) and doc[end].is_punct: end += 1 span = doc[start:end] - spans.append( - (span.start_char, span.end_char, word.tag_, word.lemma_, word.ent_type_) - ) - for start, end, tag, lemma, ent_type in spans: - doc.merge(start, end, tag=tag, lemma=lemma, 
ent_type=ent_type) + spans.append((span, word.tag_, word.lemma_, word.ent_type_)) + with doc.retokenize() as retokenizer: + for span, tag, lemma, ent_type in spans: + attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type} + retokenizer.merge(span, attrs=attrs) if options.get("fine_grained"): words = [{"text": w.text, "tag": w.tag_} for w in doc] else: diff --git a/spacy/errors.py b/spacy/errors.py index 37deb4560..f73e38c6a 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -67,6 +67,9 @@ class Warnings(object): "components are applied. To only create tokenized Doc objects, " "try using `nlp.make_doc(text)` or process all texts as a stream " "using `list(nlp.tokenizer.pipe(all_texts))`.") + W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more " + "efficient and less error-prone Doc.retokenize context manager " + "instead.") @add_codes diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index 23000d948..925f0e0fc 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -4,9 +4,6 @@ from __future__ import unicode_literals from ..matcher import Matcher -# TODO: replace doc.merge with doc.retokenize - - def merge_noun_chunks(doc): """Merge noun chunks into a single token. @@ -15,11 +12,10 @@ def merge_noun_chunks(doc): """ if not doc.is_parsed: return doc - spans = [ - (np.start_char, np.end_char, np.root.tag, np.root.dep) for np in doc.noun_chunks - ] - for start, end, tag, dep in spans: - doc.merge(start, end, tag=tag, dep=dep) + with doc.retokenize() as retokenizer: + for np in doc.noun_chunks: + attrs = {"tag": np.root.tag, "dep": np.root.dep} + retokenizer.merge(np, attrs=attrs) return doc @@ -29,11 +25,10 @@ def merge_entities(doc): doc (Doc): The Doc object. RETURNS (Doc): The Doc object with merged noun entities. 
""" - spans = [ - (e.start_char, e.end_char, e.root.tag, e.root.dep, e.label) for e in doc.ents - ] - for start, end, tag, dep, ent_type in spans: - doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type) + with doc.retokenize() as retokenizer: + for ent in doc.ents: + attrs = {"tag": ent.root.tag, "dep": ent.root.dep, "ent_type": ent.label} + retokenizer.merge(ent, attrs=attrs) return doc @@ -42,7 +37,7 @@ def merge_subtokens(doc, label="subtok"): merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}]) matches = merger(doc) spans = [doc[start : end + 1] for _, start, end in matches] - offsets = [(span.start_char, span.end_char) for span in spans] - for start_char, end_char in offsets: - doc.merge(start_char, end_char) + with doc.retokenize() as retokenizer: + for span in spans: + retokenizer.merge(span) return doc diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 77632d7c7..878ecd240 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -141,66 +141,13 @@ def test_doc_api_set_ents(en_tokenizer): def test_doc_api_merge(en_tokenizer): text = "WKRO played songs by the beach boys all night" - - # merge 'The Beach Boys' - doc = en_tokenizer(text) - assert len(doc) == 9 - doc.merge( - doc[4].idx, - doc[6].idx + len(doc[6]), - tag="NAMED", - lemma="LEMMA", - ent_type="TYPE", - ) - assert len(doc) == 7 - assert doc[4].text == "the beach boys" - assert doc[4].text_with_ws == "the beach boys " - assert doc[4].tag_ == "NAMED" - - # merge 'all night' - doc = en_tokenizer(text) - assert len(doc) == 9 - doc.merge( - doc[7].idx, - doc[8].idx + len(doc[8]), - tag="NAMED", - lemma="LEMMA", - ent_type="TYPE", - ) - assert len(doc) == 8 - assert doc[7].text == "all night" - assert doc[7].text_with_ws == "all night" - + attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} # merge both with bulk merge doc = en_tokenizer(text) assert len(doc) == 9 with doc.retokenize() as retokenizer: - retokenizer.merge( - doc[4:7], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} - ) - retokenizer.merge( - doc[7:9], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} - ) - - assert len(doc) == 6 - assert doc[4].text == "the beach boys" - assert doc[4].text_with_ws == "the beach boys " - assert doc[4].tag_ == "NAMED" - assert doc[5].text == "all night" - assert doc[5].text_with_ws == "all night" - assert doc[5].tag_ == "NAMED" - - # merge both with bulk merge - doc = en_tokenizer(text) - assert len(doc) == 9 - with doc.retokenize() as retokenizer: - retokenizer.merge( - doc[4:7], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} - ) - retokenizer.merge( - doc[7:9], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} - ) - + retokenizer.merge(doc[4:7], attrs=attrs) + retokenizer.merge(doc[7:9], attrs=attrs) assert len(doc) == 6 assert doc[4].text == "the beach boys" assert doc[4].text_with_ws == "the beach boys " @@ -213,16 +160,11 @@ def test_doc_api_merge(en_tokenizer): def test_doc_api_merge_children(en_tokenizer): """Test that attachments work correctly after merging.""" text = "WKRO played songs by the beach boys all night" + attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} doc = en_tokenizer(text) assert len(doc) == 9 - doc.merge( - doc[4].idx, - doc[6].idx + len(doc[6]), - tag="NAMED", - lemma="LEMMA", - ent_type="TYPE", - ) - + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[4:7], attrs=attrs) for word in doc: if word.i < word.head.i: assert word in list(word.head.lefts) 
@@ -233,8 +175,9 @@ def test_doc_api_merge_children(en_tokenizer): def test_doc_api_merge_hang(en_tokenizer): text = "through North and South Carolina" doc = en_tokenizer(text) - doc.merge(18, 32, tag="", lemma="", ent_type="ORG") - doc.merge(8, 32, tag="", lemma="", ent_type="ORG") + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[3:5], attrs={"lemma": "", "ent_type": "ORG"}) + retokenizer.merge(doc[1:2], attrs={"lemma": "", "ent_type": "ORG"}) def test_doc_api_retokenizer(en_tokenizer): @@ -287,21 +230,22 @@ def test_doc_api_runtime_error(en_tokenizer): "pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg", "ROOT", "amod", "dobj"] # fmt: on - tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps) - nps = [] for np in doc.noun_chunks: while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"): np = np[1:] if len(np) > 1: - nps.append( - (np.start_char, np.end_char, np.root.tag_, np.text, np.root.ent_type_) - ) - for np in nps: - start, end, tag, lemma, ent_type = np - doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type) + nps.append(np) + with doc.retokenize() as retokenizer: + for np in nps: + attrs = { + "tag": np.root.tag_, + "lemma": np.text, + "ent_type": np.root.ent_type_, + } + retokenizer.merge(np, attrs=attrs) def test_doc_api_right_edge(en_tokenizer): diff --git a/spacy/tests/doc/test_span_merge.py b/spacy/tests/doc/test_span_merge.py index 75a28380c..87d475f1f 100644 --- a/spacy/tests/doc/test_span_merge.py +++ b/spacy/tests/doc/test_span_merge.py @@ -16,17 +16,9 @@ def test_spans_merge_tokens(en_tokenizer): assert len(doc) == 4 assert doc[0].head.text == "Angeles" assert doc[1].head.text == "start" - doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", ent_type="GPE") - assert len(doc) == 3 - assert doc[0].text == "Los Angeles" - assert doc[0].head.text == "start" - - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) - assert len(doc) == 4 - assert doc[0].head.text == "Angeles" - assert doc[1].head.text == "start" - doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", label="GPE") - + with doc.retokenize() as retokenizer: + attrs = {"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"} + retokenizer.merge(doc[0:2], attrs=attrs) assert len(doc) == 3 assert doc[0].text == "Los Angeles" assert doc[0].head.text == "start" @@ -71,30 +63,28 @@ def test_span_np_merges(en_tokenizer): heads = [1, 0, 2, 1, -3, -1, -1, -1] tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) - assert doc[4].head.i == 1 - doc.merge( - doc[2].idx, doc[4].idx + len(doc[4]), tag="NP", lemma="tool", ent_type="O" - ) + with doc.retokenize() as retokenizer: + attrs = {"tag": "NP", "lemma": "tool", "ent_type": "O"} + retokenizer.merge(doc[2:5], attrs=attrs) assert doc[2].head.i == 1 text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript." 
heads = [1, 0, 8, 3, -1, -2, 4, 3, 1, 1, -9, -1, -1, -1, -1, -2, -15] tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) - - ents = [(e[0].idx, e[-1].idx + len(e[-1]), e.label_, e.lemma_) for e in doc.ents] - for start, end, label, lemma in ents: - merged = doc.merge(start, end, tag=label, lemma=lemma, ent_type=label) - assert merged is not None, (start, end, label, lemma) + with doc.retokenize() as retokenizer: + for ent in doc.ents: + attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_} + retokenizer.merge(ent, attrs=attrs) text = "One test with entities like New York City so the ents list is not void" heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2] tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) - for span in doc.ents: - merged = doc.merge() - assert merged is not None, (span.start, span.end, span.label_, span.lemma_) + with doc.retokenize() as retokenizer: + for ent in doc.ents: + retokenizer.merge(ent) def test_spans_entity_merge(en_tokenizer): @@ -109,13 +99,11 @@ def test_spans_entity_merge(en_tokenizer): tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents ) assert len(doc) == 17 - for ent in doc.ents: - label, lemma, type_ = ( - ent.root.tag_, - ent.root.lemma_, - max(w.ent_type_ for w in ent), - ) - ent.merge(label=label, lemma=lemma, ent_type=type_) + with doc.retokenize() as retokenizer: + for ent in doc.ents: + ent_type = max(w.ent_type_ for w in ent) + attrs = {"lemma": ent.root.lemma_, "ent_type": ent_type} + retokenizer.merge(ent, attrs=attrs) # check looping is ok assert len(doc) == 15 @@ -132,7 +120,8 @@ def test_spans_entity_merge_iob(): assert doc[1].ent_iob_ == "I" assert doc[2].ent_iob_ == "I" assert doc[3].ent_iob_ == "B" - doc[0:1].merge() + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[0:1]) assert doc[0].ent_iob_ == "B" assert doc[1].ent_iob_ == "I" @@ -172,8 +161,10 @@ def test_spans_sentence_update_after_merge(en_tokenizer): sent1, sent2 = list(doc.sents) init_len = len(sent1) init_len2 = len(sent2) - doc[0:2].merge(label="none", lemma="none", ent_type="none") - doc[-2:].merge(label="none", lemma="none", ent_type="none") + with doc.retokenize() as retokenizer: + attrs = {"lemma": "none", "ent_type": "none"} + retokenizer.merge(doc[0:2], attrs=attrs) + retokenizer.merge(doc[-2:], attrs=attrs) assert len(sent1) == init_len - 1 assert len(sent2) == init_len2 - 1 @@ -191,5 +182,7 @@ def test_spans_subtree_size_check(en_tokenizer): doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) sent1 = list(doc.sents)[0] init_len = len(list(sent1.root.subtree)) - doc[0:2].merge(label="none", lemma="none", ent_type="none") + with doc.retokenize() as retokenizer: + attrs = {"lemma": "none", "ent_type": "none"} + retokenizer.merge(doc[0:2], attrs=attrs) assert len(list(sent1.root.subtree)) == init_len - 1 diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index 587d15dd7..7b303397e 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -18,4 +18,4 @@ LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", @pytest.mark.parametrize("lang", LANGUAGES) def test_lang_initialize(lang): """Test that languages can be initialized.""" - lang_cls = get_lang_class(lang)() + lang_cls = get_lang_class(lang)() # noqa: F841 diff --git a/spacy/tests/matcher/test_matcher_api.py 
b/spacy/tests/matcher/test_matcher_api.py index 9e87359d4..7f7ebfc73 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -46,7 +46,9 @@ def test_matcher_from_usage_docs(en_vocab): if doc.vocab.strings[match_id] == "HAPPY": doc.sentiment += 0.1 span = doc[start:end] - token = span.merge() + with doc.retokenize() as retokenizer: + retokenizer.merge(span) + token = doc[start] token.vocab[token.text].norm_ = "happy emoji" matcher = Matcher(en_vocab) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 04e31d649..c140cb485 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -66,9 +66,9 @@ def test_parser_merge_pp(en_tokenizer): doc = get_doc( tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags ) - nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.lemma_) for np in doc.noun_chunks] - for start, end, lemma in nps: - doc.merge(start, end, label="NP", lemma=lemma) + with doc.retokenize() as retokenizer: + for np in doc.noun_chunks: + retokenizer.merge(np, attrs={"lemma": np.lemma_}) assert doc[0].text == "A phrase" assert doc[1].text == "with" assert doc[2].text == "another phrase" diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index 1b75406cc..00024d74a 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -9,7 +9,7 @@ from spacy.symbols import POS, VERB, VerbForm_inf from spacy.vocab import Vocab from spacy.language import Language from spacy.lemmatizer import Lemmatizer -from spacy.tokens import Doc +from spacy.tokens import Doc, Span from ..util import get_doc, make_tempdir @@ -204,12 +204,13 @@ def test_issue615(en_tokenizer): on the last match.""" if i != len(matches) - 1: return None - spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches] - for ent_id, label, span in spans: - span.merge( - tag="NNP" if label else span.root.tag_, lemma=span.text, label=label - ) - doc.ents = doc.ents + ((label, span.start, span.end),) + spans = [Span(doc, start, end, label=label) for label, start, end in matches] + with doc.retokenize() as retokenizer: + for span in spans: + tag = "NNP" if span.label_ else span.root.tag_ + attrs = {"tag": tag, "lemma": span.text} + retokenizer.merge(span, attrs=attrs) + doc.ents = doc.ents + (span,) text = "The golf club is broken" pattern = [{"ORTH": "golf"}, {"ORTH": "club"}] @@ -410,7 +411,7 @@ def test_issue957(en_tokenizer): """ # Skip test if pytest-timeout is not installed pytest.importorskip("pytest_timeout") - for punct in ['.', ',', '\'', '\"', ':', '?', '!', ';', '-']: + for punct in [".", ",", "'", '"', ":", "?", "!", ";", "-"]: string = "0" for i in range(1, 100): string += punct + str(i) diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 1bb6cebe1..28ebcb0a9 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -86,7 +86,8 @@ def test_issue1547(): words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"] doc = Doc(Vocab(), words=words) doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])] - doc[5:7].merge() + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[5:7]) assert [ent.text for ent in doc.ents] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 3b8d0daed..ba0801e34 100644 --- a/spacy/tokens/doc.pyx +++ 
b/spacy/tokens/doc.pyx @@ -898,6 +898,7 @@ cdef class Doc: indices did not fall at token boundaries. """ cdef unicode tag, lemma, ent_type + deprecation_warning(Warnings.W013.format(obj="Doc")) if len(args) == 3: deprecation_warning(Warnings.W003) tag, lemma, ent_type = args diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 593e6ddec..a418fc13f 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -18,6 +18,7 @@ from ..attrs cimport * from ..lexeme cimport Lexeme from ..compat import is_config, basestring_ from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning +from ..errors import deprecation_warning from .underscore import Underscore, get_ext_args @@ -193,6 +194,7 @@ cdef class Span: attributes are inherited from the syntactic root token of the span. RETURNS (Token): The newly merged token. """ + deprecation_warning(Warnings.W013.format(obj="Span")) return self.doc.merge(self.start_char, self.end_char, *args, **attributes) From c31a9dabd53de47aa3bda065d95944bb61ffec78 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 15 Feb 2019 10:29:59 +0100 Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=92=AB=20=20Add=20en/em=20dash=20to?= =?UTF-8?q?=20prefixes=20and=20suffixes=20=20(#3281)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Auto-format * Add en/em dash to prefixes and suffixes --- spacy/lang/punctuation.py | 17 +++++++++-------- spacy/tests/regression/test_issue3277.py | 11 +++++++++++ 2 files changed, 20 insertions(+), 8 deletions(-) create mode 100644 spacy/tests/regression/test_issue3277.py diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py index 2ec8c3e43..17e20fa0c 100644 --- a/spacy/lang/punctuation.py +++ b/spacy/lang/punctuation.py @@ -1,14 +1,13 @@ # coding: utf8 from __future__ import unicode_literals -from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY, LIST_ICONS -from .char_classes import HYPHENS -from .char_classes import CURRENCY, UNITS +from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY +from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA _prefixes = ( - ["§", "%", "=", r"\+(?![0-9])"] + ["§", "%", "=", "—", "–", r"\+(?![0-9])"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES @@ -22,13 +21,15 @@ _suffixes = ( + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS - + ["'s", "'S", "’s", "’S"] + + ["'s", "'S", "’s", "’S", "—", "–"] + [ r"(?<=[0-9])\+", r"(?<=°[FfCcKk])\.", r"(?<=[0-9])(?:{c})".format(c=CURRENCY), r"(?<=[0-9])(?:{u})".format(u=UNITS), - r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES), + r"(?<=[0-9{al}{e}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES + ), r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), ] ) @@ -40,8 +41,8 @@ _infixes = ( r"(?<=[0-9])[+\-\*^](?=[0-9-])", r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), - r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS), - r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA), + r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), + r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA), ] ) diff --git a/spacy/tests/regression/test_issue3277.py b/spacy/tests/regression/test_issue3277.py new file mode 100644 index 000000000..88ea67774 --- /dev/null +++ b/spacy/tests/regression/test_issue3277.py @@ -0,0 +1,11 @@ +# coding: utf-8 +from __future__ import unicode_literals + + 
+def test_issue3277(es_tokenizer): + """Test that hyphens are split correctly as prefixes.""" + doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.") + assert len(doc) == 14 + assert doc[0].text == "\u2014" + assert doc[5].text == "\u2013" + assert doc[9].text == "\u2013"
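
A minimal usage sketch of the migration performed by the first patch (not part of the diff itself; it assumes spaCy v2.1+ with these changes installed and uses a blank English pipeline, so no trained model is required):

import spacy

nlp = spacy.blank("en")
doc = nlp("WKRO played songs by the beach boys all night")
assert len(doc) == 9

# Deprecated since v2.1 (now emits the W013 warning added in this patch):
#     doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), tag="NAMED")

# Replacement: merge one or more spans inside a single retokenize() block.
with doc.retokenize() as retokenizer:
    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
    retokenizer.merge(doc[4:7], attrs=attrs)

assert len(doc) == 7
assert doc[4].text == "the beach boys"
assert doc[4].tag_ == "NAMED"

The behaviour added by the second patch is exercised by test_issue3277 above: with the en/em dash added to the prefix and suffix rules, a leading "—" or "–" is split off as its own token instead of being glued to the following word.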