mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	Fix formatting and typo (closes #967)
This commit is contained in:
		
							parent
							
								
									734b0a4e4a
								
							
						
					
					
						commit
						e7ae3b7cc2
					
				|  | @ -1,7 +1,8 @@ | ||||||
| '''Print part-of-speech tagged, true-cased, (very roughly) sentence-separated | """ | ||||||
|  | Print part-of-speech tagged, true-cased, (very roughly) sentence-separated | ||||||
| text, with each "sentence" on a newline, and spaces between tokens. Supports | text, with each "sentence" on a newline, and spaces between tokens. Supports | ||||||
| multi-processing. | multi-processing. | ||||||
| ''' | """ | ||||||
| from __future__ import print_function, unicode_literals, division | from __future__ import print_function, unicode_literals, division | ||||||
| import io | import io | ||||||
| import bz2 | import bz2 | ||||||
|  | @ -22,14 +23,14 @@ def parallelize(func, iterator, n_jobs, extra): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def iter_texts_from_json_bz2(loc): | def iter_texts_from_json_bz2(loc): | ||||||
|     ''' |     """ | ||||||
|     Iterator of unicode strings, one per document (here, a comment). |     Iterator of unicode strings, one per document (here, a comment). | ||||||
|      |      | ||||||
|     Expects a path to a BZ2 file, which should be new-line delimited JSON. The |     Expects a path to a BZ2 file, which should be new-line delimited JSON. The | ||||||
|     document text should be in a string field titled 'body'. |     document text should be in a string field titled 'body'. | ||||||
| 
 | 
 | ||||||
|     This is the data format of the Reddit comments corpus. |     This is the data format of the Reddit comments corpus. | ||||||
|     ''' |     """ | ||||||
|     with bz2.BZ2File(loc) as file_: |     with bz2.BZ2File(loc) as file_: | ||||||
|         for i, line in enumerate(file_): |         for i, line in enumerate(file_): | ||||||
|             yield ujson.loads(line)['body'] |             yield ujson.loads(line)['body'] | ||||||
|  | @ -80,7 +81,7 @@ def is_sent_begin(word): | ||||||
| def main(in_loc, out_dir, n_workers=4, batch_size=100000): | def main(in_loc, out_dir, n_workers=4, batch_size=100000): | ||||||
|     if not path.exists(out_dir): |     if not path.exists(out_dir): | ||||||
|         path.join(out_dir) |         path.join(out_dir) | ||||||
|     texts = partition(batch_size, iter_texts(in_loc)) |     texts = partition(batch_size, iter_texts_from_json_bz2(in_loc)) | ||||||
|     parallelize(transform_texts, enumerate(texts), n_workers, [out_dir]) |     parallelize(transform_texts, enumerate(texts), n_workers, [out_dir]) | ||||||
|   |   | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user