mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
* Fix Python3 problem in align_raw
This commit is contained in:
parent
c609ea18f0
commit
dcafb181b9
|
@ -1,6 +1,8 @@
|
||||||
"""Align the raw sentences from Read et al (2012) to the PTB tokenization,
|
"""Align the raw sentences from Read et al (2012) to the PTB tokenization,
|
||||||
outputting as a .json file. Used in bin/prepare_treebank.py
|
outputting as a .json file. Used in bin/prepare_treebank.py
|
||||||
"""
|
"""
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import json
|
import json
|
||||||
|
@ -84,7 +86,6 @@ def get_alignment(raw_by_para, ptb_by_file):
|
||||||
n_skipped = 0
|
n_skipped = 0
|
||||||
skips = []
|
skips = []
|
||||||
for (p_id, p_sent_id, raw) in raw_sents:
|
for (p_id, p_sent_id, raw) in raw_sents:
|
||||||
#print raw
|
|
||||||
if ptb_idx >= len(ptb_sents):
|
if ptb_idx >= len(ptb_sents):
|
||||||
n_skipped += 1
|
n_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
@ -104,8 +105,8 @@ def get_alignment(raw_by_para, ptb_by_file):
|
||||||
output.append((f_id, p_id, f_sent_id, (ptb_id, ''.join(sepped))))
|
output.append((f_id, p_id, f_sent_id, (ptb_id, ''.join(sepped))))
|
||||||
if n_skipped + len(ptb_sents) != len(raw_sents):
|
if n_skipped + len(ptb_sents) != len(raw_sents):
|
||||||
for ptb, raw in skips:
|
for ptb, raw in skips:
|
||||||
print ptb
|
print(ptb)
|
||||||
print raw
|
print(raw)
|
||||||
raise Exception
|
raise Exception
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user