mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Disallow merging 0-length spans
This commit is contained in:
parent
93c4d13588
commit
71fe61fdcd
|
@ -567,6 +567,7 @@ class Errors(object):
|
||||||
E197 = ("Row out of bounds, unable to add row {row} for key {key}.")
|
E197 = ("Row out of bounds, unable to add row {row} for key {key}.")
|
||||||
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
||||||
"table, which contains {n_rows} vectors.")
|
"table, which contains {n_rows} vectors.")
|
||||||
|
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
|
|
@ -425,3 +425,10 @@ def test_retokenize_skip_duplicates(en_vocab):
|
||||||
retokenizer.merge(doc[0:2])
|
retokenizer.merge(doc[0:2])
|
||||||
assert len(doc) == 2
|
assert len(doc) == 2
|
||||||
assert doc[0].text == "hello world"
|
assert doc[0].text == "hello world"
|
||||||
|
|
||||||
|
|
||||||
|
def test_retokenize_disallow_zero_length(en_vocab):
    """Merging an empty span (start == end) must raise a ValueError."""
    tokens = ["hello", "world", "!"]
    doc = Doc(en_vocab, words=tokens)
    # doc[1:1] covers no tokens, so the merge request itself should be rejected.
    with pytest.raises(ValueError), doc.retokenize() as retok:
        retok.merge(doc[1:1])
|
||||||
|
|
|
@ -55,6 +55,8 @@ cdef class Retokenizer:
|
||||||
"""
|
"""
|
||||||
if (span.start, span.end) in self._spans_to_merge:
|
if (span.start, span.end) in self._spans_to_merge:
|
||||||
return
|
return
|
||||||
|
if span.end - span.start <= 0:
|
||||||
|
raise ValueError(Errors.E199.format(start=span.start, end=span.end))
|
||||||
for token in span:
|
for token in span:
|
||||||
if token.i in self.tokens_to_merge:
|
if token.i in self.tokens_to_merge:
|
||||||
raise ValueError(Errors.E102.format(token=repr(token)))
|
raise ValueError(Errors.E102.format(token=repr(token)))
|
||||||
|
|
Loading…
Reference in New Issue
Block a user