mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	* Write JSON files, with both dependency and PSG parses
This commit is contained in:
		
							parent
							
								
									5078a32213
								
							
						
					
					
						commit
						0ad72a77ce
					
				|  | @ -60,15 +60,12 @@ def format_doc(section, filename, raw_paras, ptb_loc, dep_loc): | |||
|                     'brackets': []} | ||||
|         for raw_sent in raw_sents: | ||||
|             para['sents'].append(offset)  | ||||
|             _, brackets = read_ptb.parse(ptb_sents[i]) | ||||
|             _, annot = read_conll.parse(dep_sents[i]) | ||||
|             _, brackets = read_ptb.parse(ptb_sents[i], strip_bad_periods=True) | ||||
|             _, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True) | ||||
|             indices, word_idx, offset = _get_word_indices(raw_sent, 0, offset) | ||||
| 
 | ||||
|             for token in annot: | ||||
|                 if token['head'] == -1: | ||||
|                     head = indices[token['id']] | ||||
|                 else: | ||||
|                     head = indices[token['head']] | ||||
|                 head = indices[token['head']] | ||||
|                 try: | ||||
|                     para['tokens'].append({'start': indices[token['id']], | ||||
|                         'tag': token['tag'], | ||||
|  | @ -80,32 +77,34 @@ def format_doc(section, filename, raw_paras, ptb_loc, dep_loc): | |||
|                     print raw_sent | ||||
|                     raise | ||||
|             for label, start, end in brackets: | ||||
|                 para['brackets'].append({'label': label, | ||||
|                     'start': indices[start], | ||||
|                     'end': indices[end-1]}) | ||||
|                 if start != end: | ||||
|                     para['brackets'].append({'label': label, | ||||
|                         'start': indices[start], | ||||
|                         'end': indices[end-1]}) | ||||
|             i += 1 | ||||
|         doc['paragraphs'].append(para) | ||||
|     return doc | ||||
| 
 | ||||
| 
 | ||||
| def main(onto_dir, raw_dir, out_loc): | ||||
|     docs = [] | ||||
| def main(onto_dir, raw_dir, out_dir): | ||||
|     for i in range(25): | ||||
|         section = str(i) if i >= 10 else ('0' + str(i)) | ||||
|         raw_loc = path.join(raw_dir, 'wsj%s.json' % section) | ||||
|         docs = [] | ||||
|         for j, raw_paras in enumerate(_iter_raw_files(raw_loc)): | ||||
|             if section == '00': | ||||
|                 j += 1 | ||||
|             filename = str(j) if j >= 9 else ('0' + str(j)) | ||||
|             if section == '04' and filename == '55': | ||||
|                 continue | ||||
|             ptb_loc = path.join(onto_dir, section, 'wsj_%s%s.parse' % (section, filename)) | ||||
|             dep_loc = ptb_loc + '.dep' | ||||
|             ptb_loc = path.join(onto_dir, section, 'wsj_%s%s.mrg' % (section, filename)) | ||||
|             dep_loc = ptb_loc + '.3.pa.gs.tab' | ||||
|             if path.exists(ptb_loc) and path.exists(dep_loc): | ||||
|                 print ptb_loc | ||||
|                 doc = format_doc(section, filename, raw_paras, ptb_loc, dep_loc) | ||||
|                 docs.append(doc) | ||||
|     json.dump(docs, open(out_loc, 'w')) | ||||
|         with open(path.join(out_dir, '%s.json' % section), 'w') as file_: | ||||
|             json.dump(docs, file_) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|  |  | |||
							
								
								
									
										0
									
								
								spacy/munge/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								spacy/munge/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										175
									
								
								spacy/munge/align_raw.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										175
									
								
								spacy/munge/align_raw.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,175 @@ | |||
| """Align the raw sentences from Read et al (2012) to the PTB tokenization, | ||||
| outputing the format: | ||||
| 
 | ||||
| [{ | ||||
|     section: int, | ||||
|     file: string, | ||||
|     paragraphs: [{ | ||||
|         raw: string, | ||||
|         segmented: string, | ||||
|         tokens: [int]}]}] | ||||
| """ | ||||
| import plac | ||||
| from pathlib import Path | ||||
| import json | ||||
| from os import path | ||||
| 
 | ||||
| from spacy.munge import read_ptb | ||||
| 
 | ||||
| 
 | ||||
def read_unsegmented(section_loc):
    """Read one section of the Read et al. (2012) unsegmented raw text.

    Lines starting with '[' hold sentence text after a '|' separator; any
    other line marks a paragraph break.  Returns a list of paragraphs,
    each a list of (patched) raw sentence strings.
    """
    # Arbitrary patches applied to the _raw_ text to promote alignment.
    patches = (
        ('. . . .', '...'),
        ('....', '...'),
        ('Co..', 'Co.'),
        ("`", "'"),
    )

    paragraphs = []
    current = []
    with open(section_loc) as file_:
        for raw_line in file_:
            if not raw_line.startswith('['):
                # Separator line: close off the current paragraph.
                paragraphs.append(current)
                current = []
                continue
            sent = raw_line.split('|', 1)[1].strip()
            for find, replace in patches:
                sent = sent.replace(find, replace)
            current.append(sent)
    paragraphs.append(current)
    return paragraphs
| 
 | ||||
| 
 | ||||
def read_ptb_sec(ptb_sec_dir):
    """Read every .parse/.mrg file in one PTB section directory.

    Returns a list of files, each a list of sentence strings whose tokens
    are joined by single spaces with PTB escapes undone.
    """
    ptb_sec_dir = Path(ptb_sec_dir)
    files = []
    # sorted(): Path.iterdir() yields entries in arbitrary, filesystem-dependent
    # order, but alignment against the raw text needs the wsj_XXXX files in
    # their numeric order.
    for loc in sorted(ptb_sec_dir.iterdir()):
        if not str(loc).endswith('parse') and not str(loc).endswith('mrg'):
            continue
        with loc.open() as file_:
            text = file_.read()
        sents = []
        for parse_str in read_ptb.split(text):
            words, brackets = read_ptb.parse(parse_str, strip_bad_periods=True)
            words = [_reform_ptb_word(word) for word in words]
            string = ' '.join(words)
            sents.append(string)
        files.append(sents)
    return files
| 
 | ||||
| 
 | ||||
| def _reform_ptb_word(tok): | ||||
|     tok = tok.replace("``", '"') | ||||
|     tok = tok.replace("`", "'") | ||||
|     tok = tok.replace("''", '"') | ||||
|     tok = tok.replace('\\', '') | ||||
|     tok = tok.replace('-LCB-', '{') | ||||
|     tok = tok.replace('-RCB-', '}') | ||||
|     tok = tok.replace('-RRB-', ')') | ||||
|     tok = tok.replace('-LRB-', '(') | ||||
|     tok = tok.replace("'T-", "'T") | ||||
|     return tok | ||||
|   | ||||
| 
 | ||||
def get_alignment(raw_by_para, ptb_by_file):
    """Pair each raw sentence with its PTB counterpart, marking unmatched
    PTB characters with '<SEP>'.

    Both inputs are lists-of-lists (by paragraph and by file respectively);
    they are flattened to (outer_id, inner_id, item) triples and zipped in
    order.  Returns (file_id, para_id, file_sent_id, marked_string) tuples.
    """
    raw_sents = _flatten(raw_by_para)
    ptb_sents = _flatten(ptb_by_file)

    assert len(raw_sents) == len(ptb_sents)

    output = []
    for raw_entry, ptb_entry in zip(raw_sents, ptb_sents):
        p_id, p_sent_id, raw = raw_entry
        f_id, f_sent_id, ptb = ptb_entry
        alignment = align_chars(raw, ptb)
        # `is False` matters: alignment entries may be the int 0 (a real
        # raw index) or True (aligned space) — only False means "no match".
        pieces = []
        for idx, ch in enumerate(ptb):
            pieces.append('<SEP>' if alignment[idx] is False else ch)
        output.append((f_id, p_id, f_sent_id, ''.join(pieces)))
    return output
| 
 | ||||
| 
 | ||||
| def _flatten(nested): | ||||
|     flat = [] | ||||
|     for id1, inner in enumerate(nested): | ||||
|         flat.extend((id1, id2, item) for id2, item in enumerate(inner)) | ||||
|     return flat | ||||
| 
 | ||||
| 
 | ||||
def align_chars(raw, ptb):
    """Character-align *ptb* against *raw* (case-insensitive).

    Returns a list the length of *ptb*: alignment[j] is the raw index of
    ptb[j], True when both strings have a space at the aligned position,
    and False when ptb[j] has no raw counterpart (an inserted separator).

    Raises AssertionError when a non-space character mismatch is found.
    """
    i = 0
    j = 0

    length = len(raw)
    alignment = [False for _ in range(len(ptb))]
    # `continue` after each space branch (the original fell through and
    # consumed a second character pair, which could run past either end
    # on consecutive or trailing spaces).  Bound j as well as i.
    while i < length and j < len(ptb):
        if raw[i] == ' ' and ptb[j] == ' ':
            alignment[j] = True
            i += 1
            j += 1
            continue
        elif raw[i] == ' ':
            i += 1
            continue
        elif ptb[j] == ' ':
            j += 1
            continue
        # Error message shows the unmatched raw tail (was raw[i:1], an
        # empty/one-char slice typo).
        assert raw[i].lower() == ptb[j].lower(), raw[i:]
        alignment[j] = i
        i += 1
        j += 1
    return alignment
| 
 | ||||
| 
 | ||||
def group_into_files(sents):
    """Group aligned sentences (f_id, p_id, s_id, sent) into per-file lists.

    Assumes *sents* is ordered by file id; a change of f_id starts a new
    group.  Returns a list of groups, each keeping the full tuples.
    """
    last_id = 0
    this = []
    output = []
    for f_id, p_id, s_id, sent in sents:
        # Guard on `this` (matching group_into_paras) so a first file id
        # other than 0 does not emit a spurious empty leading group.
        if f_id != last_id and this:
            output.append(this)
            this = []
        this.append((f_id, p_id, s_id, sent))
        last_id = f_id
    if this:
        output.append(this)
    return output
| 
 | ||||
| 
 | ||||
def group_into_paras(sents):
    """Group one file's sentences into paragraphs by paragraph id.

    Takes (f_id, p_id, s_id, sent) tuples ordered by paragraph; returns a
    list of paragraphs, each a list of bare sentence strings.
    """
    output = []
    current = []
    prev_p_id = 0
    for f_id, p_id, s_id, sent in sents:
        if p_id != prev_p_id and current:
            output.append(current)
            current = []
        current.append(sent)
        prev_p_id = p_id
    if current:
        output.append(current)
    return output
| 
 | ||||
| 
 | ||||
def get_sections(odc_dir, ptb_dir, out_dir):
    """Yield (odc_loc, ptb_section_dir, out_loc) triples for WSJ 00-24."""
    for sec_num in range(25):
        section = '%02d' % sec_num
        yield (path.join(odc_dir, 'wsj%s.txt' % section),
               path.join(ptb_dir, section),
               path.join(out_dir, 'wsj%s.json' % section))
| 
 | ||||
| 
 | ||||
def main(odc_dir, ptb_dir, out_dir):
    """Align every WSJ section's raw text to its PTB tokenization and write
    one JSON file per section into *out_dir*."""
    for raw_loc, section_dir, json_loc in get_sections(odc_dir, ptb_dir, out_dir):
        paragraphs = read_unsegmented(raw_loc)
        parsed_files = read_ptb_sec(section_dir)
        aligned_sents = get_alignment(paragraphs, parsed_files)
        by_file = []
        for file_sents in group_into_files(aligned_sents):
            by_file.append(group_into_paras(file_sents))
        with open(json_loc, 'w') as out_file:
            json.dump(by_file, out_file)
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|     plac.call(main) | ||||
							
								
								
									
										40
									
								
								spacy/munge/read_conll.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										40
									
								
								spacy/munge/read_conll.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,40 @@ | |||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| 
 | ||||
def split(text):
    """Split a CoNLL file into per-sentence chunks (blank-line separated),
    dropping empty chunks."""
    chunks = text.split('\n\n')
    return [chunk.strip() for chunk in chunks if chunk.strip()]
| 
 | ||||
| 
 | ||||
| def parse(sent_text, strip_bad_periods=False): | ||||
|     sent_text = sent_text.strip() | ||||
|     assert sent_text | ||||
|     annot = [] | ||||
|     words = [] | ||||
|     i = 0 | ||||
|     for line in sent_text.split('\n'): | ||||
|         word, tag, head, dep = line.split() | ||||
|         if strip_bad_periods and words and _is_bad_period(words[-1], word): | ||||
|             continue | ||||
|    | ||||
|         annot.append({ | ||||
|             'id': i, | ||||
|             'word': word, | ||||
|             'tag': tag, | ||||
|             'head': int(head) - 1 if int(head) != 0 else i, | ||||
|             'dep': dep}) | ||||
|         words.append(word) | ||||
|         i += 1 | ||||
|     return words, annot | ||||
| 
 | ||||
| 
 | ||||
| def _is_bad_period(prev, period): | ||||
|     if period != '.': | ||||
|         return False | ||||
|     elif prev == '.': | ||||
|         return False | ||||
|     elif not prev.endswith('.'): | ||||
|         return False | ||||
|     else: | ||||
|         return True | ||||
| 
 | ||||
| 
 | ||||
							
								
								
									
										65
									
								
								spacy/munge/read_ptb.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										65
									
								
								spacy/munge/read_ptb.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,65 @@ | |||
| import re | ||||
| import os | ||||
| from os import path | ||||
| 
 | ||||
| 
 | ||||
def parse(sent_text, strip_bad_periods=False):
    """Parse one bracketed PTB sentence string.

    Returns (words, brackets): `words` is the list of terminal tokens;
    `brackets` is a list of (label, start, end) phrase spans, where `start`
    is the word index when the bracket opened and `end` is the word index
    reached when it closed (so `end` reads as exclusive).  Leaf-level POS
    brackets are not recorded; -NONE- traces contribute a zero-width
    bracket and no token.

    When strip_bad_periods is True, a '.' token that duplicates a preceding
    abbreviation's period is dropped entirely (its bracket included).
    """
    sent_text = sent_text.strip()
    assert sent_text and sent_text.startswith('(')
    open_brackets = []
    brackets = []
    # Matches either "(LABEL" (groups 1-2) or "[text])" (groups 3-4);
    # labels/text are runs of non-space, non-paren characters.
    bracketsRE = re.compile(r'(\()([^\s\)\(]+)|([^\s\)\(]+)?(\))')
    word_i = 0
    words = []
    # Remove outermost bracket
    if sent_text.startswith('(('):
        sent_text = sent_text.replace('((', '( (', 1)
    # sent_text[2:-1] skips the (already-separated) outer "( " and ")".
    for match in bracketsRE.finditer(sent_text[2:-1]):
        open_, label, text, close = match.groups()
        if open_:
            assert not close
            assert label.strip()
            open_brackets.append((label, word_i))
        else:
            assert close
            label, start = open_brackets.pop()
            assert label.strip()
            # Skips both the token and its bracket for a duplicated period.
            if strip_bad_periods and words and _is_bad_period(words[-1], text):
                continue
            # Traces leave 0-width bracket, but no token
            if text and label != '-NONE-':
                words.append(text)
                word_i += 1
            else:
                brackets.append((label, start, word_i))
    return words, brackets
| 
 | ||||
| 
 | ||||
| def _is_bad_period(prev, period): | ||||
|     if period != '.': | ||||
|         return False | ||||
|     elif prev == '.': | ||||
|         return False | ||||
|     elif not prev.endswith('.'): | ||||
|         return False | ||||
|     else: | ||||
|         return True | ||||
| 
 | ||||
| 
 | ||||
def split(text):
    """Split a PTB parse file into one string per sentence.

    A new sentence is assumed to begin on each line that starts with '(';
    continuation lines (indented in real PTB files) are appended to the
    current sentence.  Blank lines are discarded.
    """
    sentences = []
    buffered = []

    for raw_line in text.strip().split('\n'):
        stripped = raw_line.rstrip()
        if not stripped:
            continue
        # Detect the start of sentences by line starting with (
        # This is messy, but it keeps bracket parsing at the sentence level
        if stripped.startswith('(') and buffered:
            sentences.append('\n'.join(buffered))
            buffered = []
        buffered.append(stripped)
    if buffered:
        sentences.append('\n'.join(buffered))
    return sentences
		Loading…
	
		Reference in New Issue
	
	Block a user