💫 Replace {Doc,Span}.merge with Doc.retokenize (#3280)

* Add deprecation warning to Doc.merge and Span.merge

* Replace {Doc,Span}.merge with Doc.retokenize
Ines Montani 2019-02-15 10:29:44 +01:00 committed by Matthew Honnibal
parent f146121092
commit 5651a0d052
14 changed files with 100 additions and 152 deletions
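The change applied across these files follows one pattern: instead of calling the deprecated `doc.merge(start_char, end_char, **attrs)` on character offsets, spans are collected first and then merged in bulk inside the `Doc.retokenize()` context manager, which the new W013 warning describes as more efficient and less error-prone. A minimal sketch of the migration (the `en_core_web_sm` model is only an assumption for illustration):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model; any English pipeline works here
doc = nlp("WKRO played songs by the beach boys all night")

# Deprecated since v2.1.0 (now emits warning W013):
# doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), tag="NAMED", lemma="LEMMA")

# Replacement: merge whole token spans in one retokenize() block
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[4:7], attrs={"lemma": "the beach boys"})  # "the beach boys"
    retokenizer.merge(doc[7:9], attrs={"lemma": "all night"})       # "all night"

print([token.text for token in doc])

Collecting the spans before entering the block means no character offsets have to be recomputed while tokens are being merged, which is what made the old offset-based API easy to get wrong.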

View File

@@ -112,10 +112,10 @@ def write_conllu(docs, file_):
     for i, doc in enumerate(docs):
         matches = merger(doc)
         spans = [doc[start : end + 1] for _, start, end in matches]
-        offsets = [(span.start_char, span.end_char) for span in spans]
-        for start_char, end_char in offsets:
-            doc.merge(start_char, end_char)
-        # TODO: This shuldn't be necessary? Should be handled in merge
+        with doc.retokenize() as retokenizer:
+            for span in spans:
+                retokenizer.merge(span)
+        # TODO: This shouldn't be necessary? Should be handled in merge
         for word in doc:
             if word.i == word.head.i:
                 word.dep_ = "ROOT"

View File

@@ -217,9 +217,9 @@ def write_conllu(docs, file_):
     for i, doc in enumerate(docs):
         matches = merger(doc)
         spans = [doc[start : end + 1] for _, start, end in matches]
-        offsets = [(span.start_char, span.end_char) for span in spans]
-        for start_char, end_char in offsets:
-            doc.merge(start_char, end_char)
+        with doc.retokenize() as retokenizer:
+            for span in spans:
+                retokenizer.merge(span)
         file_.write("# newdoc id = {i}\n".format(i=i))
         for j, sent in enumerate(doc.sents):
             file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))

View File

@@ -107,8 +107,14 @@ def parse_deps(orig_doc, options={}):
     if not doc.is_parsed:
         user_warning(Warnings.W005)
     if options.get("collapse_phrases", False):
-        for np in list(doc.noun_chunks):
-            np.merge(tag=np.root.tag_, lemma=np.root.lemma_, ent_type=np.root.ent_type_)
+        with doc.retokenize() as retokenizer:
+            for np in list(doc.noun_chunks):
+                attrs = {
+                    "tag": np.root.tag_,
+                    "lemma": np.root.lemma_,
+                    "ent_type": np.root.ent_type_,
+                }
+                retokenizer.merge(np, attrs=attrs)
     if options.get("collapse_punct", True):
         spans = []
         for word in doc[:-1]:
@@ -119,11 +125,11 @@ def parse_deps(orig_doc, options={}):
             while end < len(doc) and doc[end].is_punct:
                 end += 1
             span = doc[start:end]
-            spans.append(
-                (span.start_char, span.end_char, word.tag_, word.lemma_, word.ent_type_)
-            )
-        for start, end, tag, lemma, ent_type in spans:
-            doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type)
+            spans.append((span, word.tag_, word.lemma_, word.ent_type_))
+        with doc.retokenize() as retokenizer:
+            for span, tag, lemma, ent_type in spans:
+                attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type}
+                retokenizer.merge(span, attrs=attrs)
     if options.get("fine_grained"):
         words = [{"text": w.text, "tag": w.tag_} for w in doc]
     else:
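For reference, these two branches are driven by displaCy's rendering options. A hedged usage sketch (the model name is an assumption; only the option names come from the code above):

import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")  # assumed model with a dependency parser
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")

# collapse_phrases merges each noun chunk into a single token before rendering;
# collapse_punct (on by default) attaches trailing punctuation to the previous token.
svg = displacy.render(doc, style="dep",
                      options={"collapse_phrases": True, "collapse_punct": True})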

View File

@@ -67,6 +67,9 @@ class Warnings(object):
             "components are applied. To only create tokenized Doc objects, "
             "try using `nlp.make_doc(text)` or process all texts as a stream "
             "using `list(nlp.tokenizer.pipe(all_texts))`.")
+    W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more "
+            "efficient and less error-prone Doc.retokenize context manager "
+            "instead.")


 @add_codes

View File

@@ -4,9 +4,6 @@ from __future__ import unicode_literals
 from ..matcher import Matcher


-# TODO: replace doc.merge with doc.retokenize
 def merge_noun_chunks(doc):
     """Merge noun chunks into a single token.
@@ -15,11 +12,10 @@ def merge_noun_chunks(doc):
     """
     if not doc.is_parsed:
         return doc
-    spans = [
-        (np.start_char, np.end_char, np.root.tag, np.root.dep) for np in doc.noun_chunks
-    ]
-    for start, end, tag, dep in spans:
-        doc.merge(start, end, tag=tag, dep=dep)
+    with doc.retokenize() as retokenizer:
+        for np in doc.noun_chunks:
+            attrs = {"tag": np.root.tag, "dep": np.root.dep}
+            retokenizer.merge(np, attrs=attrs)
     return doc
@@ -29,11 +25,10 @@ def merge_entities(doc):
     doc (Doc): The Doc object.
     RETURNS (Doc): The Doc object with merged noun entities.
     """
-    spans = [
-        (e.start_char, e.end_char, e.root.tag, e.root.dep, e.label) for e in doc.ents
-    ]
-    for start, end, tag, dep, ent_type in spans:
-        doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type)
+    with doc.retokenize() as retokenizer:
+        for ent in doc.ents:
+            attrs = {"tag": ent.root.tag, "dep": ent.root.dep, "ent_type": ent.label}
+            retokenizer.merge(ent, attrs=attrs)
     return doc
@@ -42,7 +37,7 @@ def merge_subtokens(doc, label="subtok"):
     merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}])
     matches = merger(doc)
     spans = [doc[start : end + 1] for _, start, end in matches]
-    offsets = [(span.start_char, span.end_char) for span in spans]
-    for start_char, end_char in offsets:
-        doc.merge(start_char, end_char)
+    with doc.retokenize() as retokenizer:
+        for span in spans:
+            retokenizer.merge(span)
     return doc
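The functions changed here double as pipeline components. A short sketch of how `merge_entities` and `merge_noun_chunks` are typically wired in (the model name is an assumption):

import spacy
from spacy.pipeline import merge_entities, merge_noun_chunks

nlp = spacy.load("en_core_web_sm")  # assumed model with parser and NER
nlp.add_pipe(merge_entities)        # v2.x API: add_pipe takes the component callable
nlp.add_pipe(merge_noun_chunks)

doc = nlp("Angela Merkel visited the Brandenburg Gate in Berlin.")
print([token.text for token in doc])  # entities and noun chunks are now single tokens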

View File

@@ -141,66 +141,13 @@ def test_doc_api_set_ents(en_tokenizer):
 def test_doc_api_merge(en_tokenizer):
     text = "WKRO played songs by the beach boys all night"
-    # merge 'The Beach Boys'
-    doc = en_tokenizer(text)
-    assert len(doc) == 9
-    doc.merge(
-        doc[4].idx,
-        doc[6].idx + len(doc[6]),
-        tag="NAMED",
-        lemma="LEMMA",
-        ent_type="TYPE",
-    )
-    assert len(doc) == 7
-    assert doc[4].text == "the beach boys"
-    assert doc[4].text_with_ws == "the beach boys "
-    assert doc[4].tag_ == "NAMED"
-    # merge 'all night'
-    doc = en_tokenizer(text)
-    assert len(doc) == 9
-    doc.merge(
-        doc[7].idx,
-        doc[8].idx + len(doc[8]),
-        tag="NAMED",
-        lemma="LEMMA",
-        ent_type="TYPE",
-    )
-    assert len(doc) == 8
-    assert doc[7].text == "all night"
-    assert doc[7].text_with_ws == "all night"
+    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
     # merge both with bulk merge
     doc = en_tokenizer(text)
     assert len(doc) == 9
     with doc.retokenize() as retokenizer:
-        retokenizer.merge(
-            doc[4:7], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
-        )
-        retokenizer.merge(
-            doc[7:9], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
-        )
-    assert len(doc) == 6
-    assert doc[4].text == "the beach boys"
-    assert doc[4].text_with_ws == "the beach boys "
-    assert doc[4].tag_ == "NAMED"
-    assert doc[5].text == "all night"
-    assert doc[5].text_with_ws == "all night"
-    assert doc[5].tag_ == "NAMED"
-    # merge both with bulk merge
-    doc = en_tokenizer(text)
-    assert len(doc) == 9
-    with doc.retokenize() as retokenizer:
-        retokenizer.merge(
-            doc[4:7], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
-        )
-        retokenizer.merge(
-            doc[7:9], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
-        )
+        retokenizer.merge(doc[4:7], attrs=attrs)
+        retokenizer.merge(doc[7:9], attrs=attrs)
     assert len(doc) == 6
     assert doc[4].text == "the beach boys"
     assert doc[4].text_with_ws == "the beach boys "
@@ -213,16 +160,11 @@ def test_doc_api_merge(en_tokenizer):
 def test_doc_api_merge_children(en_tokenizer):
     """Test that attachments work correctly after merging."""
     text = "WKRO played songs by the beach boys all night"
+    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
     doc = en_tokenizer(text)
     assert len(doc) == 9
-    doc.merge(
-        doc[4].idx,
-        doc[6].idx + len(doc[6]),
-        tag="NAMED",
-        lemma="LEMMA",
-        ent_type="TYPE",
-    )
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[4:7], attrs=attrs)
     for word in doc:
         if word.i < word.head.i:
             assert word in list(word.head.lefts)
@@ -233,8 +175,9 @@ def test_doc_api_merge_children(en_tokenizer):
 def test_doc_api_merge_hang(en_tokenizer):
     text = "through North and South Carolina"
     doc = en_tokenizer(text)
-    doc.merge(18, 32, tag="", lemma="", ent_type="ORG")
-    doc.merge(8, 32, tag="", lemma="", ent_type="ORG")
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[3:5], attrs={"lemma": "", "ent_type": "ORG"})
+        retokenizer.merge(doc[1:2], attrs={"lemma": "", "ent_type": "ORG"})


 def test_doc_api_retokenizer(en_tokenizer):
@@ -287,21 +230,22 @@ def test_doc_api_runtime_error(en_tokenizer):
             "pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg",
             "ROOT", "amod", "dobj"]
     # fmt: on
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
     nps = []
     for np in doc.noun_chunks:
         while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"):
             np = np[1:]
         if len(np) > 1:
-            nps.append(
-                (np.start_char, np.end_char, np.root.tag_, np.text, np.root.ent_type_)
-            )
-    for np in nps:
-        start, end, tag, lemma, ent_type = np
-        doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type)
+            nps.append(np)
+    with doc.retokenize() as retokenizer:
+        for np in nps:
+            attrs = {
+                "tag": np.root.tag_,
+                "lemma": np.text,
+                "ent_type": np.root.ent_type_,
+            }
+            retokenizer.merge(np, attrs=attrs)


 def test_doc_api_right_edge(en_tokenizer):

View File

@@ -16,17 +16,9 @@ def test_spans_merge_tokens(en_tokenizer):
     assert len(doc) == 4
     assert doc[0].head.text == "Angeles"
     assert doc[1].head.text == "start"
-    doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", ent_type="GPE")
-    assert len(doc) == 3
-    assert doc[0].text == "Los Angeles"
-    assert doc[0].head.text == "start"
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
-    assert len(doc) == 4
-    assert doc[0].head.text == "Angeles"
-    assert doc[1].head.text == "start"
-    doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", label="GPE")
+    with doc.retokenize() as retokenizer:
+        attrs = {"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"}
+        retokenizer.merge(doc[0:2], attrs=attrs)
     assert len(doc) == 3
     assert doc[0].text == "Los Angeles"
     assert doc[0].head.text == "start"
@@ -71,30 +63,28 @@ def test_span_np_merges(en_tokenizer):
     heads = [1, 0, 2, 1, -3, -1, -1, -1]
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
     assert doc[4].head.i == 1
-    doc.merge(
-        doc[2].idx, doc[4].idx + len(doc[4]), tag="NP", lemma="tool", ent_type="O"
-    )
+    with doc.retokenize() as retokenizer:
+        attrs = {"tag": "NP", "lemma": "tool", "ent_type": "O"}
+        retokenizer.merge(doc[2:5], attrs=attrs)
     assert doc[2].head.i == 1

     text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
     heads = [1, 0, 8, 3, -1, -2, 4, 3, 1, 1, -9, -1, -1, -1, -1, -2, -15]
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
-    ents = [(e[0].idx, e[-1].idx + len(e[-1]), e.label_, e.lemma_) for e in doc.ents]
-    for start, end, label, lemma in ents:
-        merged = doc.merge(start, end, tag=label, lemma=lemma, ent_type=label)
-        assert merged is not None, (start, end, label, lemma)
+    with doc.retokenize() as retokenizer:
+        for ent in doc.ents:
+            attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_}
+            retokenizer.merge(ent, attrs=attrs)

     text = "One test with entities like New York City so the ents list is not void"
     heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2]
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
-    for span in doc.ents:
-        merged = doc.merge()
-        assert merged is not None, (span.start, span.end, span.label_, span.lemma_)
+    with doc.retokenize() as retokenizer:
+        for ent in doc.ents:
+            retokenizer.merge(ent)


 def test_spans_entity_merge(en_tokenizer):
@@ -109,13 +99,11 @@ def test_spans_entity_merge(en_tokenizer):
         tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents
     )
     assert len(doc) == 17
-    for ent in doc.ents:
-        label, lemma, type_ = (
-            ent.root.tag_,
-            ent.root.lemma_,
-            max(w.ent_type_ for w in ent),
-        )
-        ent.merge(label=label, lemma=lemma, ent_type=type_)
+    with doc.retokenize() as retokenizer:
+        for ent in doc.ents:
+            ent_type = max(w.ent_type_ for w in ent)
+            attrs = {"lemma": ent.root.lemma_, "ent_type": ent_type}
+            retokenizer.merge(ent, attrs=attrs)
     # check looping is ok
     assert len(doc) == 15
@@ -132,7 +120,8 @@ def test_spans_entity_merge_iob():
     assert doc[1].ent_iob_ == "I"
     assert doc[2].ent_iob_ == "I"
     assert doc[3].ent_iob_ == "B"
-    doc[0:1].merge()
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[0:1])
     assert doc[0].ent_iob_ == "B"
     assert doc[1].ent_iob_ == "I"
@@ -172,8 +161,10 @@ def test_spans_sentence_update_after_merge(en_tokenizer):
     sent1, sent2 = list(doc.sents)
     init_len = len(sent1)
     init_len2 = len(sent2)
-    doc[0:2].merge(label="none", lemma="none", ent_type="none")
-    doc[-2:].merge(label="none", lemma="none", ent_type="none")
+    with doc.retokenize() as retokenizer:
+        attrs = {"lemma": "none", "ent_type": "none"}
+        retokenizer.merge(doc[0:2], attrs=attrs)
+        retokenizer.merge(doc[-2:], attrs=attrs)
     assert len(sent1) == init_len - 1
     assert len(sent2) == init_len2 - 1
@@ -191,5 +182,7 @@ def test_spans_subtree_size_check(en_tokenizer):
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     sent1 = list(doc.sents)[0]
     init_len = len(list(sent1.root.subtree))
-    doc[0:2].merge(label="none", lemma="none", ent_type="none")
+    with doc.retokenize() as retokenizer:
+        attrs = {"lemma": "none", "ent_type": "none"}
+        retokenizer.merge(doc[0:2], attrs=attrs)
     assert len(list(sent1.root.subtree)) == init_len - 1

View File

@@ -18,4 +18,4 @@ LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
 @pytest.mark.parametrize("lang", LANGUAGES)
 def test_lang_initialize(lang):
     """Test that languages can be initialized."""
-    lang_cls = get_lang_class(lang)()
+    lang_cls = get_lang_class(lang)()  # noqa: F841

View File

@@ -46,7 +46,9 @@ def test_matcher_from_usage_docs(en_vocab):
         if doc.vocab.strings[match_id] == "HAPPY":
             doc.sentiment += 0.1
         span = doc[start:end]
-        token = span.merge()
+        with doc.retokenize() as retokenizer:
+            retokenizer.merge(span)
+        token = doc[start]
         token.vocab[token.text].norm_ = "happy emoji"

     matcher = Matcher(en_vocab)

View File

@@ -66,9 +66,9 @@ def test_parser_merge_pp(en_tokenizer):
     doc = get_doc(
         tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags
     )
-    nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.lemma_) for np in doc.noun_chunks]
-    for start, end, lemma in nps:
-        doc.merge(start, end, label="NP", lemma=lemma)
+    with doc.retokenize() as retokenizer:
+        for np in doc.noun_chunks:
+            retokenizer.merge(np, attrs={"lemma": np.lemma_})
     assert doc[0].text == "A phrase"
     assert doc[1].text == "with"
     assert doc[2].text == "another phrase"

View File

@@ -9,7 +9,7 @@ from spacy.symbols import POS, VERB, VerbForm_inf
 from spacy.vocab import Vocab
 from spacy.language import Language
 from spacy.lemmatizer import Lemmatizer
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span
 from ..util import get_doc, make_tempdir
@@ -204,12 +204,13 @@ def test_issue615(en_tokenizer):
         on the last match."""
         if i != len(matches) - 1:
             return None
-        spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
-        for ent_id, label, span in spans:
-            span.merge(
-                tag="NNP" if label else span.root.tag_, lemma=span.text, label=label
-            )
-            doc.ents = doc.ents + ((label, span.start, span.end),)
+        spans = [Span(doc, start, end, label=label) for label, start, end in matches]
+        with doc.retokenize() as retokenizer:
+            for span in spans:
+                tag = "NNP" if span.label_ else span.root.tag_
+                attrs = {"tag": tag, "lemma": span.text}
+                retokenizer.merge(span, attrs=attrs)
+                doc.ents = doc.ents + (span,)

     text = "The golf club is broken"
     pattern = [{"ORTH": "golf"}, {"ORTH": "club"}]
@@ -410,7 +411,7 @@ def test_issue957(en_tokenizer):
     """
     # Skip test if pytest-timeout is not installed
     pytest.importorskip("pytest_timeout")
-    for punct in ['.', ',', '\'', '\"', ':', '?', '!', ';', '-']:
+    for punct in [".", ",", "'", '"', ":", "?", "!", ";", "-"]:
         string = "0"
         for i in range(1, 100):
             string += punct + str(i)

View File

@@ -86,7 +86,8 @@ def test_issue1547():
     words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
     doc = Doc(Vocab(), words=words)
     doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
-    doc[5:7].merge()
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[5:7])
     assert [ent.text for ent in doc.ents]

View File

@@ -898,6 +898,7 @@ cdef class Doc:
             indices did not fall at token boundaries.
         """
         cdef unicode tag, lemma, ent_type
+        deprecation_warning(Warnings.W013.format(obj="Doc"))
         if len(args) == 3:
             deprecation_warning(Warnings.W003)
             tag, lemma, ent_type = args

View File

@@ -18,6 +18,7 @@ from ..attrs cimport *
 from ..lexeme cimport Lexeme
 from ..compat import is_config, basestring_
 from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning
+from ..errors import deprecation_warning
 from .underscore import Underscore, get_ext_args
@@ -193,6 +194,7 @@ cdef class Span:
         attributes are inherited from the syntactic root token of the span.
         RETURNS (Token): The newly merged token.
         """
+        deprecation_warning(Warnings.W013.format(obj="Span"))
        return self.doc.merge(self.start_char, self.end_char, *args,
                              **attributes)
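Both deprecated entry points keep working for now, with Span.merge simply delegating to Doc.merge, but each call emits W013. A rough sketch of what callers can expect, assuming the message is surfaced through Python's warnings module as the deprecation_warning helper suggests:

import warnings
import spacy

nlp = spacy.blank("en")            # a blank English pipeline is enough for this check
doc = nlp("New York is big")

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    doc[0:2].merge()               # deprecated: Span.merge delegates to Doc.merge
print([str(w.message) for w in caught])  # expected to include the W013 text

# Warning-free equivalent:
doc = nlp("New York is big")
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:2])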