mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Merge pull request #5485 from adrianeboyd/bugfix/retokenizer-merge-0-length-5450
Disallow merging 0-length spans
This commit is contained in:
commit
8cb16c7120
|
@ -567,6 +567,7 @@ class Errors(object):
|
|||
E197 = ("Row out of bounds, unable to add row {row} for key {key}.")
|
||||
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
||||
"table, which contains {n_rows} vectors.")
|
||||
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
|
@ -425,3 +425,10 @@ def test_retokenize_skip_duplicates(en_vocab):
|
|||
retokenizer.merge(doc[0:2])
|
||||
assert len(doc) == 2
|
||||
assert doc[0].text == "hello world"
|
||||
|
||||
|
||||
def test_retokenize_disallow_zero_length(en_vocab):
    """Merging an empty (zero-length) span must raise a ValueError."""
    words = ["hello", "world", "!"]
    doc = Doc(en_vocab, words=words)
    # doc[1:1] starts and ends at the same token index, so it covers no tokens.
    with pytest.raises(ValueError), doc.retokenize() as merger:
        merger.merge(doc[1:1])
|
||||
|
|
|
@ -55,6 +55,8 @@ cdef class Retokenizer:
|
|||
"""
|
||||
if (span.start, span.end) in self._spans_to_merge:
|
||||
return
|
||||
if span.end - span.start <= 0:
|
||||
raise ValueError(Errors.E199.format(start=span.start, end=span.end))
|
||||
for token in span:
|
||||
if token.i in self.tokens_to_merge:
|
||||
raise ValueError(Errors.E102.format(token=repr(token)))
|
||||
|
|
Loading…
Reference in New Issue
Block a user