mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	Accept iob2 and allow generic whitespace (#2999)
* accept non-pipe whitespace as delimiter; allow iob2 filename
* added small documentation note for IOB2 allowance
* added contributor agreement
This commit is contained in:
		
							parent
							
								
									2457318b7a
								
							
						
					
					
						commit
						9c8c4287bf
					
				|  | @ -7,6 +7,8 @@ from ...compat import json_dumps, path2str | ||||||
| from ...util import prints | from ...util import prints | ||||||
| from ...gold import iob_to_biluo | from ...gold import iob_to_biluo | ||||||
| 
 | 
 | ||||||
|  | import re | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def iob2json(input_path, output_path, n_sents=10, *a, **k): | def iob2json(input_path, output_path, n_sents=10, *a, **k): | ||||||
|     """ |     """ | ||||||
|  | @ -15,7 +17,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k): | ||||||
|     with input_path.open('r', encoding='utf8') as file_: |     with input_path.open('r', encoding='utf8') as file_: | ||||||
|         sentences = read_iob(file_) |         sentences = read_iob(file_) | ||||||
|     docs = merge_sentences(sentences, n_sents) |     docs = merge_sentences(sentences, n_sents) | ||||||
|     output_filename = input_path.parts[-1].replace(".iob", ".json") |     output_filename = (input_path.parts[-1] | ||||||
|  |                        .replace(".iob2", ".json") | ||||||
|  |                        .replace(".iob", ".json")) | ||||||
|     output_file = output_path / output_filename |     output_file = output_path / output_filename | ||||||
|     with output_file.open('w', encoding='utf-8') as f: |     with output_file.open('w', encoding='utf-8') as f: | ||||||
|         f.write(json_dumps(docs)) |         f.write(json_dumps(docs)) | ||||||
|  | @ -28,7 +32,7 @@ def read_iob(raw_sents): | ||||||
|     for line in raw_sents: |     for line in raw_sents: | ||||||
|         if not line.strip(): |         if not line.strip(): | ||||||
|             continue |             continue | ||||||
|         tokens = [t.split('|') for t in line.split()] |         tokens = [re.split('[^\w\-]', line.strip())] | ||||||
|         if len(tokens[0]) == 3: |         if len(tokens[0]) == 3: | ||||||
|             words, pos, iob = zip(*tokens) |             words, pos, iob = zip(*tokens) | ||||||
|         else: |         else: | ||||||
|  |  | ||||||
|  | @ -245,7 +245,7 @@ p The following file format converters are available: | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code iob] |         +cell #[code iob] | ||||||
|         +cell IOB named entity recognition format. |         +cell IOB or IOB2 named entity recognition format. | ||||||
| 
 | 
 | ||||||
| +h(3, "train") Train | +h(3, "train") Train | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user