mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	Fix spaces in Doc.from_docs for empty docs (#10052)
Fix spaces in `Doc.from_docs(ensure_whitespace=True)` for cases where a doc ending in whitespace is followed by an empty doc.
This commit is contained in:
		
							parent
							
								
									c28e33637b
								
							
						
					
					
						commit
						4dfd559e55
					
				|  | @ -567,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): | ||||||
|         "Merging the docs is fun.", |         "Merging the docs is fun.", | ||||||
|         "", |         "", | ||||||
|         "They don't think alike. ", |         "They don't think alike. ", | ||||||
|  |         "", | ||||||
|         "Another doc.", |         "Another doc.", | ||||||
|     ] |     ] | ||||||
|     en_texts_without_empty = [t for t in en_texts if len(t)] |     en_texts_without_empty = [t for t in en_texts if len(t)] | ||||||
|  | @ -574,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): | ||||||
|     en_docs = [en_tokenizer(text) for text in en_texts] |     en_docs = [en_tokenizer(text) for text in en_texts] | ||||||
|     en_docs[0].spans["group"] = [en_docs[0][1:4]] |     en_docs[0].spans["group"] = [en_docs[0][1:4]] | ||||||
|     en_docs[2].spans["group"] = [en_docs[2][1:4]] |     en_docs[2].spans["group"] = [en_docs[2][1:4]] | ||||||
|     en_docs[3].spans["group"] = [en_docs[3][0:1]] |     en_docs[4].spans["group"] = [en_docs[4][0:1]] | ||||||
|     span_group_texts = sorted( |     span_group_texts = sorted( | ||||||
|         [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text] |         [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text] | ||||||
|     ) |     ) | ||||||
|     de_doc = de_tokenizer(de_text) |     de_doc = de_tokenizer(de_text) | ||||||
|     Token.set_extension("is_ambiguous", default=False) |     Token.set_extension("is_ambiguous", default=False) | ||||||
|  |  | ||||||
|  | @ -1183,7 +1183,7 @@ cdef class Doc: | ||||||
|                 token_offset = -1 |                 token_offset = -1 | ||||||
|                 for doc in docs[:-1]: |                 for doc in docs[:-1]: | ||||||
|                     token_offset += len(doc) |                     token_offset += len(doc) | ||||||
|                     if not (len(doc) > 0 and doc[-1].is_space): |                     if len(doc) > 0 and not doc[-1].is_space: | ||||||
|                         concat_spaces[token_offset] = True |                         concat_spaces[token_offset] = True | ||||||
| 
 | 
 | ||||||
|         concat_array = numpy.concatenate(arrays) |         concat_array = numpy.concatenate(arrays) | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user