From d59f968d0837fe7bda04bb85e26f05aedbbd2133 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 29 Mar 2021 13:32:00 +0200
Subject: [PATCH] Keep sent starts without parse in retokenization (#7424)

In the retokenizer, only reset sent starts (with `set_children_from_heads`)
if the doc is parsed. If there is no parse, merged tokens are left with the
default unset `token.is_sent_start == None` after retokenization.
---
 spacy/tests/doc/test_retokenize_merge.py | 27 ++++++++++++++++++++++++
 spacy/tokens/_retokenize.pyx             |  6 ++++--
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py
index 48cd33890..36fa3c15d 100644
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -452,3 +452,30 @@ def test_retokenize_disallow_zero_length(en_vocab):
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
             retokenizer.merge(doc[1:1])
+
+
+def test_doc_retokenize_merge_without_parse_keeps_sents(en_tokenizer):
+    text = "displaCy is a parse tool built with Javascript"
+    sent_starts = [1, 0, 0, 0, 1, 0, 0, 0]
+    tokens = en_tokenizer(text)
+
+    # merging within a sentence keeps all sentence boundaries
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
+    assert len(list(doc.sents)) == 2
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[1:3])
+    assert len(list(doc.sents)) == 2
+
+    # merging over a sentence boundary unsets the boundary by default
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
+    assert len(list(doc.sents)) == 2
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[3:6])
+    assert doc[3].is_sent_start is None
+
+    # merging over a sentence boundary and setting sent_start keeps both sentences
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
+    assert len(list(doc.sents)) == 2
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[3:6], attrs={"sent_start": True})
+    assert len(list(doc.sents)) == 2
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 3cb2965a9..5c7523667 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -281,7 +281,8 @@ def _merge(Doc doc, merges):
     for i in range(doc.length):
         doc.c[i].head -= i
     # Set the left/right children, left/right edges
-    set_children_from_heads(doc.c, 0, doc.length)
+    if doc.has_annotation("DEP"):
+        set_children_from_heads(doc.c, 0, doc.length)
     # Make sure ent_iob remains consistent
     make_iob_consistent(doc.c, doc.length)
     # Return the merged Python object
@@ -392,7 +393,8 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     for i in range(doc.length):
         doc.c[i].head -= i
     # set children from head
-    set_children_from_heads(doc.c, 0, doc.length)
+    if doc.has_annotation("DEP"):
+        set_children_from_heads(doc.c, 0, doc.length)
 
 
 def _validate_extensions(extensions):
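
Note for reviewers: a minimal sketch of the post-patch behavior, adapted from
the new test. It assumes spaCy v3 (for the `Doc(vocab, words=...,
sent_starts=...)` constructor) and substitutes a bare `Vocab` for the test's
`en_tokenizer` fixture, so the word list is passed by hand; treat it as an
illustration, not part of the patch.

# A sketch of the post-patch behavior, adapted from the new test.
# Assumes spaCy v3; uses a bare Vocab instead of the en_tokenizer fixture.
from spacy.tokens import Doc
from spacy.vocab import Vocab

words = ["displaCy", "is", "a", "parse", "tool", "built", "with", "Javascript"]
sent_starts = [1, 0, 0, 0, 1, 0, 0, 0]  # "tool" starts the second sentence

# Merging "parse tool built" swallows the boundary at "tool". With no
# dependency parse, the merged token's sentence start is left unset
# instead of being recomputed via set_children_from_heads.
doc = Doc(Vocab(), words=words, sent_starts=sent_starts)
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[3:6])
assert doc[3].is_sent_start is None

# Callers who want to keep a boundary can pin it explicitly via attrs.
doc = Doc(Vocab(), words=words, sent_starts=sent_starts)
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[3:6], attrs={"sent_start": True})
assert len(list(doc.sents)) == 2

The net effect is conservative merging: a swallowed boundary becomes unknown
(None) rather than silently extending the previous sentence, and the `attrs`
escape hatch covers callers that already know the answer.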
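
The guard itself hinges on `Doc.has_annotation("DEP")`, which reports whether
dependency labels are set on any token. A quick look at the unparsed case the
guard now skips (again a sketch, not part of the patch):

# An unparsed doc reports no "DEP" annotation, so after this patch the
# retokenizer skips set_children_from_heads for it.
from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["displaCy", "rocks"])
assert not doc.has_annotation("DEP")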