Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-01 00:17:44 +03:00)
			
		
		
		
	Fix concatenation in iob2json converter
This commit is contained in:
		
							parent
							
								
									4896ce3320
								
							
						
					
					
						commit
						31681d20e0
					
				|  | @ -1,5 +1,6 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| from cytoolz import partition_all, concat | ||||
| 
 | ||||
| from ...compat import json_dumps, path2str | ||||
| from ...util import prints | ||||
|  | @ -10,22 +11,24 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k): | |||
|     """ | ||||
|     Convert IOB files into JSON format for use with train cli. | ||||
|     """ | ||||
|     # TODO: This isn't complete yet -- need to map from IOB to | ||||
|     # BILUO | ||||
|     with input_path.open('r', encoding='utf8') as file_: | ||||
|         docs = read_iob(file_) | ||||
|         if n_sents: | ||||
|             lines = [' '.join(para) for para in partition_all(n_sents, file_)] | ||||
|         else: | ||||
|             lines = file_ | ||||
|     sentences = read_iob(lines) | ||||
| 
 | ||||
|     output_filename = input_path.parts[-1].replace(".iob", ".json") | ||||
|     output_file = output_path / output_filename | ||||
|     with output_file.open('w', encoding='utf-8') as f: | ||||
|         f.write(json_dumps(docs)) | ||||
|     prints("Created %d documents" % len(docs), | ||||
|         f.write(json_dumps(sentences)) | ||||
|     prints("Created %d documents" % len(sentences), | ||||
|            title="Generated output file %s" % path2str(output_file)) | ||||
| 
 | ||||
| 
 | ||||
| def read_iob(file_): | ||||
| def read_iob(raw_sents): | ||||
|     sentences = [] | ||||
|     for line in file_: | ||||
|     for line in raw_sents: | ||||
|         if not line.strip(): | ||||
|             continue | ||||
|         tokens = [t.split('|') for t in line.split()] | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user