mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 13:11:03 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			90 lines
		
	
	
		
			2.7 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			90 lines
		
	
	
		
			2.7 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/env python
 | |
| from __future__ import unicode_literals
 | |
| 
 | |
| from xml.etree import cElementTree as ElementTree
 | |
| import json
 | |
| import re
 | |
| 
 | |
| import plac
 | |
| from pathlib import Path
 | |
| from os import path
 | |
| 
 | |
| 
 | |
# Bracket placeholder tokens mapped to the literal bracket character each one
# stands for (round, square and curly; opening and closing).
escaped_tokens = {
    '-LRB-': '(', '-RRB-': ')',
    '-LSB-': '[', '-RSB-': ']',
    '-LCB-': '{', '-RCB-': '}',
}
 | |
| 
 | |
def read_parses(parse_loc):
    """Read the dependency parses stored alongside ``parse_loc``.

    The file at ``str(parse_loc) + '.dep'`` is read in full and split into
    parses on blank lines. Each parse is passed through
    ``_adjust_token_ids`` with an offset equal to the number of token lines
    in all preceding parses, so ids keep increasing across the document
    instead of restarting in every parse.

    Args:
        parse_loc: Path (or path string) of the parse file, without the
            trailing ``.dep`` suffix.

    Returns:
        A list with one renumbered parse string per parse, in file order.
        An empty or whitespace-only file yields an empty list.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(str(parse_loc) + '.dep') as dep_file:
        text = dep_file.read().strip()
    if not text:
        # ''.split('\n\n') would yield [''], which has no columns to adjust.
        return []
    offset = 0
    doc = []
    for parse in text.split('\n\n'):
        parse = _adjust_token_ids(parse, offset)
        # One line per token, so the line count is the id shift for the
        # next parse.
        offset += len(parse.split('\n'))
        doc.append(parse)
    return doc
 | |
| 
 | |
| def _adjust_token_ids(parse, offset):
 | |
|     output = []
 | |
|     for line in parse.split('\n'):
 | |
|         pieces = line.split()
 | |
|         pieces[0] = str(int(pieces[0]) + offset)
 | |
|         pieces[5] = str(int(pieces[5]) + offset) if pieces[5] != '0' else '0'
 | |
|         output.append('\t'.join(pieces))
 | |
|     return '\n'.join(output)
 | |
| 
 | |
| 
 | |
| def _fmt_doc(filename, paras):
 | |
|     return {'id': filename, 'paragraphs': [_fmt_para(*para) for para in paras]}
 | |
| 
 | |
| 
 | |
| def _fmt_para(raw, sents):
 | |
|     return {'raw': raw, 'sentences': [_fmt_sent(sent) for sent in sents]}
 | |
| 
 | |
| 
 | |
def _fmt_sent(sent):
    """Build the sentence record from a parse string with one token per line.

    Every line is split on whitespace and its columns are passed
    positionally to ``_fmt_token``. ``brackets`` is always an empty list.
    """
    rows = sent.strip().split('\n')
    tokens = [_fmt_token(*row.split()) for row in rows]
    return {'tokens': tokens, 'brackets': []}
 | |
| 
 | |
| 
 | |
| def _fmt_token(id_, word, hyph, pos, ner, head, dep, blank1, blank2, blank3):
 | |
|     head = int(head) - 1
 | |
|     id_ = int(id_) - 1
 | |
|     head = (head - id_) if head != -1 else 0
 | |
|     return {'id': id_, 'orth': word, 'tag': pos, 'dep': dep, 'head': head}
 | |
| 
 | |
| 
 | |
# Matches markup tags: '<', then a word character, '?' or '/', then one or
# more characters other than '>', then the closing '>'.
tags_re = re.compile(r'<[\w\?/][^>]+>')


def main(out_dir, ewtb_dir='/usr/local/data/eng_web_tbk'):
    """Convert the treebank under ``ewtb_dir`` into one JSON file per genre.

    For every genre directory in ``<ewtb_dir>/data``, each file in
    ``source/source_original`` is paired with the parse file
    ``penntree/<name>.xml.tree`` (``read_parses`` appends ``.dep``), and the
    resulting documents are written to ``<out_dir>/<genre>.json``.

    Args:
        out_dir: Directory that receives the JSON files; created (including
            missing parents) if it does not exist.
        ewtb_dir: Root directory of the treebank distribution.
    """
    ewtb_dir = Path(ewtb_dir)
    out_dir = Path(out_dir)
    if not out_dir.exists():
        out_dir.mkdir(parents=True)
    for genre_dir in ewtb_dir.joinpath('data').iterdir():
        # Skip stray files in the data directory; only directories can hold
        # the penntree/ and source/ layout read below.
        if not genre_dir.is_dir():
            continue
        genre = genre_dir.parts[-1]
        parse_dir = genre_dir.joinpath('penntree')
        source_dir = genre_dir.joinpath('source').joinpath('source_original')
        docs = []
        for source_loc in source_dir.iterdir():
            # Strip the known source-file extensions to get the document name.
            filename = source_loc.parts[-1].replace('.sgm.sgm', '')
            filename = filename.replace('.xml', '')
            filename = filename.replace('.txt', '')
            parse_loc = parse_dir.joinpath(filename + '.xml.tree')
            parses = read_parses(parse_loc)
            # Close the source file deterministically instead of leaking it.
            with source_loc.open() as source_file:
                source = source_file.read().strip()
            # Test the genre name only, so an 'answers' component elsewhere
            # in the path cannot trigger tag stripping for every genre.
            if 'answers' in genre:
                source = tags_re.sub('', source).strip()
            docs.append(_fmt_doc(filename, [[source, parses]]))

        out_loc = out_dir.joinpath(genre + '.json')
        with open(str(out_loc), 'w') as out_file:
            out_file.write(json.dumps(docs, indent=4))
 | |
| 
 | |
| 
 | |
if __name__ == '__main__':
    # Hand main over to plac, which invokes it for command-line use.
    plac.call(main)
 |