From 71fe61fdcd6c04de739391251bb346ba1de94e4e Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 22 May 2020 10:14:34 +0200
Subject: [PATCH] Disallow merging 0-length spans

---
 spacy/errors.py                          | 1 +
 spacy/tests/doc/test_retokenize_merge.py | 7 +++++++
 spacy/tokens/_retokenize.pyx             | 2 ++
 3 files changed, 10 insertions(+)

diff --git a/spacy/errors.py b/spacy/errors.py
index aca94d64e..6d92545d7 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -567,6 +567,7 @@ class Errors(object):
     E197 = ("Row out of bounds, unable to add row {row} for key {key}.")
     E198 = ("Unable to return {n} most similar vectors for the current vectors "
             "table, which contains {n_rows} vectors.")
+    E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
 
 
 @add_codes
diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py
index 5bdf78f39..636b7bb14 100644
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -425,3 +425,10 @@ def test_retokenize_skip_duplicates(en_vocab):
         retokenizer.merge(doc[0:2])
     assert len(doc) == 2
     assert doc[0].text == "hello world"
+
+
+def test_retokenize_disallow_zero_length(en_vocab):
+    doc = Doc(en_vocab, words=["hello", "world", "!"])
+    with pytest.raises(ValueError):
+        with doc.retokenize() as retokenizer:
+            retokenizer.merge(doc[1:1])
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 512ad73bc..ce8e510d6 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -55,6 +55,8 @@ cdef class Retokenizer:
         """
         if (span.start, span.end) in self._spans_to_merge:
             return
+        if span.end - span.start <= 0:
+            raise ValueError(Errors.E199.format(start=span.start, end=span.end))
         for token in span:
             if token.i in self.tokens_to_merge:
                 raise ValueError(Errors.E102.format(token=repr(token)))
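
A minimal usage sketch of the guard this patch adds, mirroring the new test and assuming only the public spaCy Doc/retokenize API: after the change, asking the retokenizer to merge a zero-length span such as doc[1:1] raises a ValueError carrying the new E199 message instead of silently proceeding.

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    doc = Doc(Vocab(), words=["hello", "world", "!"])
    try:
        with doc.retokenize() as retokenizer:
            # doc[1:1] is a zero-length span; with this patch, merge() rejects it
            retokenizer.merge(doc[1:1])
    except ValueError as err:
        # Expected to contain the E199 text: "Unable to merge 0-length span at doc[1:1]."
        print(err)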