	Fix realloc in retokenizer.split() (#4606)
Always realloc to a size larger than `doc.max_length` in `retokenizer.split()` (or cymem will throw errors).
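For context, the loop in `_split()` keeps reallocating until the buffer can hold `doc.length + nb_subtokens - 1` tokens, but neither of those values changes inside the loop. Before this fix the requested size was `doc.length * 2`, which is constant, so once the allocation reached that size the next iteration asked for a buffer no larger than the current one. The sketch below is a plain-Python model of the two growth rules, not spaCy code; the starting numbers are illustrative (spaCy pads `max_length` beyond `doc.length`), and the cymem failure is stood in for by a MemoryError:

def grow_old(length, max_length, nb_subtokens):
    # Pre-fix rule: request a fixed size of length * 2 each iteration.
    # Once max_length has reached that value, the request is no longer
    # larger than the current allocation, which is the condition under
    # which cymem throws (modelled here as a MemoryError).
    while length + nb_subtokens - 1 >= max_length:
        requested = length * 2
        if requested <= max_length:
            raise MemoryError("realloc to a size <= current allocation")
        max_length = requested
    return max_length

def grow_new(length, max_length, nb_subtokens):
    # Post-fix rule: double max_length itself, so every request is
    # strictly larger than the last and the loop always terminates.
    while length + nb_subtokens - 1 >= max_length:
        max_length *= 2
    return max_length

# Shape of the regression in #4604: one token is split into more
# subtokens than the whole doc originally contained.
print(grow_new(length=10, max_length=15, nb_subtokens=13))  # 30
try:
    grow_old(length=10, max_length=15, nb_subtokens=13)
except MemoryError as err:
    print(err)  # the second iteration requests size 20 again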
This commit is contained in:

parent f415e9b7d1
commit 91f89f9693
@@ -183,3 +183,18 @@ def test_doc_retokenizer_split_lex_attrs(en_vocab):
         retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
     assert doc[0].is_stop
     assert not doc[1].is_stop
+
+
+def test_doc_retokenizer_realloc(en_vocab):
+    """#4604: realloc correctly when new tokens outnumber original tokens"""
+    text = "Hyperglycemic adverse events following antipsychotic drug administration in the"
+    doc = Doc(en_vocab, words=text.split()[:-1])
+    with doc.retokenize() as retokenizer:
+        token = doc[0]
+        heads = [(token, 0)] * len(token)
+        retokenizer.split(doc[token.i], list(token.text), heads=heads)
+    doc = Doc(en_vocab, words=text.split())
+    with doc.retokenize() as retokenizer:
+        token = doc[0]
+        heads = [(token, 0)] * len(token)
+        retokenizer.split(doc[token.i], list(token.text), heads=heads)
@@ -329,7 +329,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
             doc.c[i].head += offset
     # Double doc.c max_length if necessary (until big enough for all new tokens)
     while doc.length + nb_subtokens - 1 >= doc.max_length:
-        doc._realloc(doc.length * 2)
+        doc._realloc(doc.max_length * 2)
     # Move tokens after the split to create space for the new tokens
     doc.length = len(doc) + nb_subtokens -1
     to_process_tensor = (doc.tensor is not None and doc.tensor.size != 0)
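Note on the one-line change: because `doc.length` and `nb_subtokens` are fixed for the duration of the `while` loop, doubling `doc.max_length` makes each realloc request strictly larger than the previous allocation, so the loop is guaranteed to terminate once the buffer can hold `doc.length + nb_subtokens - 1` tokens. The old rule of doubling `doc.length` could stall at a size the allocation had already reached, which is what triggered the cymem errors described in #4604.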