Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-27 17:12:54 +03:00)
Keep sent starts without parse in retokenization (#7424)
In the retokenizer, only reset sent starts (with `set_children_from_heads`) if the doc is parsed. If there is no parse, merged tokens default to the unset `token.is_sent_start == None` after retokenization.
This commit is contained in:
parent faed54d659
commit d59f968d08
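For illustration, a minimal sketch of the behavior this commit describes, using spaCy v3's public API (the `spacy.blank` pipeline and the example words are assumptions for the sketch, not part of this commit):

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")  # no parser, so the doc carries no dependency parse
words = ["displaCy", "is", "a", "parse", "tool", "built", "with", "Javascript"]
sent_starts = [True, False, False, False, True, False, False, False]
doc = Doc(nlp.vocab, words=words, sent_starts=sent_starts)

with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[1:3])  # merge "is a", within the first sentence

# With this commit, the manually set boundaries survive the merge.
assert len(list(doc.sents)) == 2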
@@ -452,3 +452,30 @@ def test_retokenize_disallow_zero_length(en_vocab):
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
             retokenizer.merge(doc[1:1])
+
+
+def test_doc_retokenize_merge_without_parse_keeps_sents(en_tokenizer):
+    text = "displaCy is a parse tool built with Javascript"
+    sent_starts = [1, 0, 0, 0, 1, 0, 0, 0]
+    tokens = en_tokenizer(text)
+
+    # merging within a sentence keeps all sentence boundaries
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
+    assert len(list(doc.sents)) == 2
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[1:3])
+    assert len(list(doc.sents)) == 2
+
+    # merging over a sentence boundary unsets it by default
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
+    assert len(list(doc.sents)) == 2
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[3:6])
+    assert doc[3].is_sent_start == None
+
+    # merging over a sentence boundary and setting sent_start
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
+    assert len(list(doc.sents)) == 2
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[3:6], attrs={"sent_start": True})
+    assert len(list(doc.sents)) == 2
@@ -281,7 +281,8 @@ def _merge(Doc doc, merges):
     for i in range(doc.length):
         doc.c[i].head -= i
     # Set the left/right children, left/right edges
-    set_children_from_heads(doc.c, 0, doc.length)
+    if doc.has_annotation("DEP"):
+        set_children_from_heads(doc.c, 0, doc.length)
     # Make sure ent_iob remains consistent
     make_iob_consistent(doc.c, doc.length)
     # Return the merged Python object
@@ -392,7 +393,8 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     for i in range(doc.length):
         doc.c[i].head -= i
     # set children from head
-    set_children_from_heads(doc.c, 0, doc.length)
+    if doc.has_annotation("DEP"):
+        set_children_from_heads(doc.c, 0, doc.length)


 def _validate_extensions(extensions):
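The guard in both hunks relies on `Doc.has_annotation("DEP")`, which reports whether dependency annotation is present. A quick sketch of the condition it checks (the words, heads, and dep labels here are illustrative, not from the commit):

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
words = ["This", "is", "fine"]

# Without heads/deps, has_annotation("DEP") is False, so the
# retokenizer now skips set_children_from_heads entirely.
doc = Doc(nlp.vocab, words=words)
assert not doc.has_annotation("DEP")

# With a (hand-written, illustrative) parse attached, the flag flips
# and sentence starts are recomputed from the tree as before.
parsed = Doc(nlp.vocab, words=words, heads=[1, 1, 1], deps=["nsubj", "ROOT", "acomp"])
assert parsed.has_annotation("DEP")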