* Fix Python3 problem in align_raw

This commit is contained in:
Matthew Honnibal 2015-07-28 15:52:10 +02:00
parent c609ea18f0
commit dcafb181b9

View File

@@ -1,6 +1,8 @@
"""Align the raw sentences from Read et al (2012) to the PTB tokenization, """Align the raw sentences from Read et al (2012) to the PTB tokenization,
outputting as a .json file. Used in bin/prepare_treebank.py outputting as a .json file. Used in bin/prepare_treebank.py
""" """
from __future__ import unicode_literals
import plac import plac
from pathlib import Path from pathlib import Path
import json import json
@@ -84,7 +86,6 @@ def get_alignment(raw_by_para, ptb_by_file):
n_skipped = 0 n_skipped = 0
skips = [] skips = []
for (p_id, p_sent_id, raw) in raw_sents: for (p_id, p_sent_id, raw) in raw_sents:
#print raw
if ptb_idx >= len(ptb_sents): if ptb_idx >= len(ptb_sents):
n_skipped += 1 n_skipped += 1
continue continue
@@ -104,8 +105,8 @@ def get_alignment(raw_by_para, ptb_by_file):
output.append((f_id, p_id, f_sent_id, (ptb_id, ''.join(sepped)))) output.append((f_id, p_id, f_sent_id, (ptb_id, ''.join(sepped))))
if n_skipped + len(ptb_sents) != len(raw_sents): if n_skipped + len(ptb_sents) != len(raw_sents):
for ptb, raw in skips: for ptb, raw in skips:
print ptb print(ptb)
print raw print(raw)
raise Exception raise Exception
return output return output