mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Fix Doc.from_docs for all empty docs (#8009)
This commit is contained in:
		
							parent
							
								
									debaab7021
								
							
						
					
					
						commit
						a71194362f
					
				| 
						 | 
					@ -411,6 +411,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
 | 
				
			||||||
    assert "group" in m_doc.spans
 | 
					    assert "group" in m_doc.spans
 | 
				
			||||||
    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
 | 
					    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # can merge empty docs
 | 
				
			||||||
 | 
					    doc = Doc.from_docs([en_tokenizer("")] * 10)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_doc_api_from_docs_ents(en_tokenizer):
 | 
					def test_doc_api_from_docs_ents(en_tokenizer):
 | 
				
			||||||
    texts = ["Merging the docs is fun.", "They don't think alike."]
 | 
					    texts = ["Merging the docs is fun.", "They don't think alike."]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1158,11 +1158,12 @@ cdef class Doc:
 | 
				
			||||||
            for i, array in enumerate(arrays[:-1]):
 | 
					            for i, array in enumerate(arrays[:-1]):
 | 
				
			||||||
                if len(array) > 0 and not docs[i][-1].is_space:
 | 
					                if len(array) > 0 and not docs[i][-1].is_space:
 | 
				
			||||||
                    array[-1][spacy_index] = 1
 | 
					                    array[-1][spacy_index] = 1
 | 
				
			||||||
            token_offset = -1
 | 
					            if len(concat_spaces) > 0:
 | 
				
			||||||
            for doc in docs[:-1]:
 | 
					                token_offset = -1
 | 
				
			||||||
                token_offset += len(doc)
 | 
					                for doc in docs[:-1]:
 | 
				
			||||||
                if not (len(doc) > 0 and doc[-1].is_space):
 | 
					                    token_offset += len(doc)
 | 
				
			||||||
                    concat_spaces[token_offset] = True
 | 
					                    if not (len(doc) > 0 and doc[-1].is_space):
 | 
				
			||||||
 | 
					                        concat_spaces[token_offset] = True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        concat_array = numpy.concatenate(arrays)
 | 
					        concat_array = numpy.concatenate(arrays)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user