mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	Accept iob2 and allow generic whitespace (#2999)
* accept non-pipe whitespace as delimiter; allow iob2 filename
* added small documentation note that IOB2 is accepted
* added contributor agreement
This commit is contained in:
		
							parent
							
								
									2457318b7a
								
							
						
					
					
						commit
						9c8c4287bf
					
				|  | @ -7,6 +7,8 @@ from ...compat import json_dumps, path2str | |||
| from ...util import prints | ||||
| from ...gold import iob_to_biluo | ||||
| 
 | ||||
| import re | ||||
| 
 | ||||
| 
 | ||||
| def iob2json(input_path, output_path, n_sents=10, *a, **k): | ||||
|     """ | ||||
|  | @ -15,7 +17,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k): | |||
|     with input_path.open('r', encoding='utf8') as file_: | ||||
|         sentences = read_iob(file_) | ||||
|     docs = merge_sentences(sentences, n_sents) | ||||
|     output_filename = input_path.parts[-1].replace(".iob", ".json") | ||||
|     output_filename = (input_path.parts[-1] | ||||
|                        .replace(".iob2", ".json") | ||||
|                        .replace(".iob", ".json")) | ||||
|     output_file = output_path / output_filename | ||||
|     with output_file.open('w', encoding='utf-8') as f: | ||||
|         f.write(json_dumps(docs)) | ||||
|  | @ -28,7 +32,7 @@ def read_iob(raw_sents): | |||
|     for line in raw_sents: | ||||
|         if not line.strip(): | ||||
|             continue | ||||
|         tokens = [t.split('|') for t in line.split()] | ||||
|         tokens = [re.split('[^\w\-]', line.strip())] | ||||
|         if len(tokens[0]) == 3: | ||||
|             words, pos, iob = zip(*tokens) | ||||
|         else: | ||||
|  |  | |||
|  | @ -245,7 +245,7 @@ p The following file format converters are available: | |||
| 
 | ||||
|     +row | ||||
|         +cell #[code iob] | ||||
|         +cell IOB named entity recognition format. | ||||
|         +cell IOB or IOB2 named entity recognition format. | ||||
| 
 | ||||
| +h(3, "train") Train | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user