* Print json treebank by genre, instead of by large file

Matthew Honnibal 2015-05-28 22:40:01 +02:00
parent 6b2e5c4b8a
commit 5eb64eeb11

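The change groups documents by the genre component of each OntoNotes file id before writing, producing one JSON file per genre per partition instead of one large file. A minimal sketch of that grouping, assuming ids shaped like data/english/annotations/&lt;genre&gt;/... so that file_path.split('/')[3] yields the genre; the example id is an assumption, not taken from the commit:

    # Sketch of the grouping the diff below introduces.
    # The file id here is a hypothetical OntoNotes-style path.
    from collections import defaultdict

    docs_by_genre = defaultdict(list)
    file_path = 'data/english/annotations/bn/cnn/00/cnn_0000'  # assumed layout
    genre = file_path.split('/')[3]  # -> 'bn' (broadcast news)
    docs_by_genre[genre].append({'id': file_path})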

@@ -28,6 +28,7 @@ from os import path
 import os
 import re
 import codecs
+from collections import defaultdict
 
 from spacy.munge import read_ptb
 from spacy.munge import read_conll
@@ -54,6 +55,8 @@ def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
     doc = {'id': file_id}
     if raw_paras is None:
         doc['paragraphs'] = [format_para(None, ptb_sents, dep_sents, ner_sents)]
+        #for ptb_sent, dep_sent, ner_sent in zip(ptb_sents, dep_sents, ner_sents):
+        #    doc['paragraphs'].append(format_para(None, [ptb_sent], [dep_sent], [ner_sent]))
     else:
         doc['paragraphs'] = []
         for raw_sents in raw_paras:
@@ -77,6 +80,8 @@ def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
     assert len(ptb_sents) == len(dep_sents) == len(ner_sents)
     for ptb_text, dep_text, ner_text in zip(ptb_sents, dep_sents, ner_sents):
         _, annot = read_conll.parse(dep_text, strip_bad_periods=True)
+        if annot and 'VERB' in [t['tag'] for t in annot]:
+            continue
         if ner_text is not None:
             _, ner = read_ner.parse(ner_text, strip_bad_periods=True)
         else:
@@ -155,20 +160,27 @@ def get_doc(onto_dir, file_path, wsj_docs):
     else:
         return None
 
 
 def read_ids(loc):
     return open(loc).read().strip().split('\n')
 
 
 def main(onto_dir, raw_dir, out_dir):
     wsj_docs = read_wsj_with_source(onto_dir, raw_dir)
     for partition in ('train', 'test', 'development'):
         ids = read_ids(path.join(onto_dir, '%s.id' % partition))
-        out_loc = path.join(out_dir, '%s.json' % partition)
-        docs = []
+        docs_by_genre = defaultdict(list)
         for file_path in ids:
             doc = get_doc(onto_dir, file_path, wsj_docs)
             if doc is not None:
-                docs.append(doc)
-        with open(out_loc, 'w') as file_:
-            json.dump(docs, file_, indent=4)
+                genre = file_path.split('/')[3]
+                docs_by_genre[genre].append(doc)
+        part_dir = path.join(out_dir, partition)
+        if not path.exists(part_dir):
+            os.mkdir(part_dir)
+        for genre, docs in sorted(docs_by_genre.items()):
+            out_loc = path.join(part_dir, genre + '.json')
+            with open(out_loc, 'w') as file_:
+                json.dump(docs, file_, indent=4)
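After this commit, each partition is a directory of per-genre files (e.g. train/bn.json, train/nw.json) rather than a single train.json. A minimal sketch of reading one per-genre file, assuming a hypothetical output root and the genre name 'bn'; the commit itself only specifies that main() writes &lt;out_dir&gt;/&lt;partition&gt;/&lt;genre&gt;.json via json.dump:

    # Sketch of loading one per-genre file produced by main(); paths are assumed.
    import json
    from os import path

    out_dir = '/tmp/onto_json'  # hypothetical output root passed to main()
    loc = path.join(out_dir, 'train', 'bn.json')
    with open(loc) as file_:
        docs = json.load(file_)  # list of doc dicts, each with an 'id' key
    print(len(docs))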