mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
* Edits to align_raw script, for use in prepare_treebank
This commit is contained in:
parent
4010b9b6d9
commit
732fa7709a
|
@ -183,13 +183,15 @@ def get_sections(odc_dir, ptb_dir, out_dir):
|
||||||
yield odc_loc, ptb_sec, out_loc
|
yield odc_loc, ptb_sec, out_loc
|
||||||
|
|
||||||
|
|
||||||
|
def align_section(raw_paragraphs, ptb_files):
|
||||||
|
aligned = get_alignment(raw_paragraphs, ptb_files)
|
||||||
|
return [(fn, group_into_paras(sents))
|
||||||
|
for fn, sents in group_into_files(aligned)]
|
||||||
|
|
||||||
|
|
||||||
def do_wsj(odc_dir, ptb_dir, out_dir):
|
def do_wsj(odc_dir, ptb_dir, out_dir):
|
||||||
for odc_loc, ptb_sec_dir, out_loc in get_sections(odc_dir, ptb_dir, out_dir):
|
for odc_loc, ptb_sec_dir, out_loc in get_sections(odc_dir, ptb_dir, out_dir):
|
||||||
raw_paragraphs = read_odc(odc_loc)
|
files = align_section(read_odc(odc_loc), read_ptb_sec(ptb_sec_dir))
|
||||||
ptb_files = read_ptb_sec(ptb_sec_dir)
|
|
||||||
aligned = get_alignment(raw_paragraphs, ptb_files)
|
|
||||||
files = [(fn, group_into_paras(sents))
|
|
||||||
for fn, sents in group_into_files(aligned)]
|
|
||||||
with open(out_loc, 'w') as file_:
|
with open(out_loc, 'w') as file_:
|
||||||
json.dump(files, file_)
|
json.dump(files, file_)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user