Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-27 17:12:54 +03:00)
Keep sent starts without parse in retokenization (#7424)
In the retokenizer, only reset sent starts (with `set_children_from_heads`) if the doc is parsed. If there is no parse, merged tokens default to the unset `token.is_sent_start == None` after retokenization.
This commit is contained in:
parent faed54d659
commit d59f968d08
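For illustration, a minimal sketch of the behavior this commit describes, using spaCy v3's public API (the `spacy.blank` pipeline and the example words are assumptions for the sketch, not part of this commit):

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")  # no parser, so the doc carries no dependency parse
words = ["displaCy", "is", "a", "parse", "tool", "built", "with", "Javascript"]
sent_starts = [True, False, False, False, True, False, False, False]
doc = Doc(nlp.vocab, words=words, sent_starts=sent_starts)

with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[1:3])  # merge "is a", within the first sentence

# With this commit, the manually set boundaries survive the merge.
assert len(list(doc.sents)) == 2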
@@ -452,3 +452,30 @@ def test_retokenize_disallow_zero_length(en_vocab):
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
             retokenizer.merge(doc[1:1])
+
+
+def test_doc_retokenize_merge_without_parse_keeps_sents(en_tokenizer):
+    text = "displaCy is a parse tool built with Javascript"
+    sent_starts = [1, 0, 0, 0, 1, 0, 0, 0]
+    tokens = en_tokenizer(text)
+
+    # merging within a sentence keeps all sentence boundaries
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
+    assert len(list(doc.sents)) == 2
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[1:3])
+    assert len(list(doc.sents)) == 2
+
+    # merging over a sentence boundary unsets it by default
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
+    assert len(list(doc.sents)) == 2
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[3:6])
+    assert doc[3].is_sent_start == None
+
+    # merging over a sentence boundary and setting sent_start
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
+    assert len(list(doc.sents)) == 2
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[3:6], attrs={"sent_start": True})
+    assert len(list(doc.sents)) == 2
@@ -281,7 +281,8 @@ def _merge(Doc doc, merges):
     for i in range(doc.length):
         doc.c[i].head -= i
     # Set the left/right children, left/right edges
-    set_children_from_heads(doc.c, 0, doc.length)
+    if doc.has_annotation("DEP"):
+        set_children_from_heads(doc.c, 0, doc.length)
     # Make sure ent_iob remains consistent
     make_iob_consistent(doc.c, doc.length)
     # Return the merged Python object
@@ -392,7 +393,8 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     for i in range(doc.length):
         doc.c[i].head -= i
     # set children from head
-    set_children_from_heads(doc.c, 0, doc.length)
+    if doc.has_annotation("DEP"):
+        set_children_from_heads(doc.c, 0, doc.length)


 def _validate_extensions(extensions):
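The guard in both hunks relies on `Doc.has_annotation("DEP")`, which reports whether dependency annotation is present. A quick sketch of the condition it checks (the words, heads, and dep labels here are illustrative, not from the commit):

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
words = ["This", "is", "fine"]

# Without heads/deps, has_annotation("DEP") is False, so the
# retokenizer now skips set_children_from_heads entirely.
doc = Doc(nlp.vocab, words=words)
assert not doc.has_annotation("DEP")

# With a (hand-written, illustrative) parse attached, the flag flips
# and sentence starts are recomputed from the tree as before.
parsed = Doc(nlp.vocab, words=words, heads=[1, 1, 1], deps=["nsubj", "ROOT", "acomp"])
assert parsed.has_annotation("DEP")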