From 732fa7709a56c6a9228c67f3f67ff6e55da0a38d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 27 May 2015 04:23:31 +0200
Subject: [PATCH] * Edits to align_raw script, for use in prepare_treebank

---
 spacy/munge/align_raw.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/spacy/munge/align_raw.py b/spacy/munge/align_raw.py
index b065c9a8e..af72f6b81 100644
--- a/spacy/munge/align_raw.py
+++ b/spacy/munge/align_raw.py
@@ -183,13 +183,15 @@ def get_sections(odc_dir, ptb_dir, out_dir):
         yield odc_loc, ptb_sec, out_loc
 
 
+def align_section(raw_paragraphs, ptb_files):
+    aligned = get_alignment(raw_paragraphs, ptb_files)
+    return [(fn, group_into_paras(sents))
+            for fn, sents in group_into_files(aligned)]
+
+
 def do_wsj(odc_dir, ptb_dir, out_dir):
     for odc_loc, ptb_sec_dir, out_loc in get_sections(odc_dir, ptb_dir, out_dir):
-        raw_paragraphs = read_odc(odc_loc)
-        ptb_files = read_ptb_sec(ptb_sec_dir)
-        aligned = get_alignment(raw_paragraphs, ptb_files)
-        files = [(fn, group_into_paras(sents))
-                 for fn, sents in group_into_files(aligned)]
+        files = align_section(read_odc(odc_loc), read_ptb_sec(ptb_sec_dir))
         with open(out_loc, 'w') as file_:
             json.dump(files, file_)
 