Keep sent starts without parse in retokenization (#7424)

In the retokenizer, only reset sent starts (with
`set_children_from_head`) if the doc is parsed. If there is no parse,
merged tokens have the unset `token.is_sent_start == None` by default after
retokenization.
This commit is contained in:
Adriane Boyd 2021-03-29 13:32:00 +02:00 committed by GitHub
parent faed54d659
commit d59f968d08
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 31 additions and 2 deletions

View File

@ -452,3 +452,30 @@ def test_retokenize_disallow_zero_length(en_vocab):
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[1:1])
def test_doc_retokenize_merge_without_parse_keeps_sents(en_tokenizer):
    """Retokenizer merges on an unparsed doc must respect manual sentence starts.

    Covers three cases for a doc whose sentence boundaries were set via
    ``sent_starts`` (no dependency parse):
    1. merging inside one sentence keeps both sentence boundaries,
    2. merging across a boundary unsets it (``is_sent_start`` becomes None),
    3. merging across a boundary while explicitly setting ``sent_start``
       keeps two sentences.
    """
    text = "displaCy is a parse tool built with Javascript"
    sent_starts = [1, 0, 0, 0, 1, 0, 0, 0]
    tokens = en_tokenizer(text)
    # merging within a sentence keeps all sentence boundaries
    doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
    assert len(list(doc.sents)) == 2
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[1:3])
    assert len(list(doc.sents)) == 2
    # merging over a sentence boundary unsets it by default
    doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
    assert len(list(doc.sents)) == 2
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[3:6])
    # use `is None`, not `== None` (PEP 8 / E711)
    assert doc[3].is_sent_start is None
    # merging over a sentence boundary and setting sent_start
    doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
    assert len(list(doc.sents)) == 2
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[3:6], attrs={"sent_start": True})
    assert len(list(doc.sents)) == 2

View File

@ -281,7 +281,8 @@ def _merge(Doc doc, merges):
     for i in range(doc.length):
         doc.c[i].head -= i
     # Set the left/right children, left/right edges
-    set_children_from_heads(doc.c, 0, doc.length)
+    if doc.has_annotation("DEP"):
+        set_children_from_heads(doc.c, 0, doc.length)
     # Make sure ent_iob remains consistent
     make_iob_consistent(doc.c, doc.length)
     # Return the merged Python object
@ -392,7 +393,8 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     for i in range(doc.length):
         doc.c[i].head -= i
     # set children from head
-    set_children_from_heads(doc.c, 0, doc.length)
+    if doc.has_annotation("DEP"):
+        set_children_from_heads(doc.c, 0, doc.length)

 def _validate_extensions(extensions):