mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Merge pull request #5485 from adrianeboyd/bugfix/retokenizer-merge-0-length-5450
Disallow merging 0-length spans
This commit is contained in:
		
						commit
						8cb16c7120
					
				| 
						 | 
					@ -567,6 +567,7 @@ class Errors(object):
 | 
				
			||||||
    E197 = ("Row out of bounds, unable to add row {row} for key {key}.")
 | 
					    E197 = ("Row out of bounds, unable to add row {row} for key {key}.")
 | 
				
			||||||
    E198 = ("Unable to return {n} most similar vectors for the current vectors "
 | 
					    E198 = ("Unable to return {n} most similar vectors for the current vectors "
 | 
				
			||||||
            "table, which contains {n_rows} vectors.")
 | 
					            "table, which contains {n_rows} vectors.")
 | 
				
			||||||
 | 
					    E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@add_codes
 | 
					@add_codes
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -425,3 +425,10 @@ def test_retokenize_skip_duplicates(en_vocab):
 | 
				
			||||||
        retokenizer.merge(doc[0:2])
 | 
					        retokenizer.merge(doc[0:2])
 | 
				
			||||||
    assert len(doc) == 2
 | 
					    assert len(doc) == 2
 | 
				
			||||||
    assert doc[0].text == "hello world"
 | 
					    assert doc[0].text == "hello world"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_retokenize_disallow_zero_length(en_vocab):
 | 
				
			||||||
 | 
					    doc = Doc(en_vocab, words=["hello", "world", "!"])
 | 
				
			||||||
 | 
					    with pytest.raises(ValueError):
 | 
				
			||||||
 | 
					        with doc.retokenize() as retokenizer:
 | 
				
			||||||
 | 
					            retokenizer.merge(doc[1:1])
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -55,6 +55,8 @@ cdef class Retokenizer:
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        if (span.start, span.end) in self._spans_to_merge:
 | 
					        if (span.start, span.end) in self._spans_to_merge:
 | 
				
			||||||
            return
 | 
					            return
 | 
				
			||||||
 | 
					        if span.end - span.start <= 0:
 | 
				
			||||||
 | 
					            raise ValueError(Errors.E199.format(start=span.start, end=span.end))
 | 
				
			||||||
        for token in span:
 | 
					        for token in span:
 | 
				
			||||||
            if token.i in self.tokens_to_merge:
 | 
					            if token.i in self.tokens_to_merge:
 | 
				
			||||||
                raise ValueError(Errors.E102.format(token=repr(token)))
 | 
					                raise ValueError(Errors.E102.format(token=repr(token)))
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user