From 4073533e2884fa85587ba7dad8d24299e23ec79d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2015 02:10:33 +0200 Subject: [PATCH] * Upd munge_ewtb for the new json format --- bin/munge_ewtb.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/bin/munge_ewtb.py b/bin/munge_ewtb.py index 3a10108e6..4e21ceb07 100755 --- a/bin/munge_ewtb.py +++ b/bin/munge_ewtb.py @@ -43,14 +43,20 @@ def _fmt_doc(filename, paras): def _fmt_para(raw, sents): - # Get sentence starts - starts = [int(sent.split()[0]) for sent in sents] - return {'raw': raw, 'sents': starts, - 'tokens': [_fmt_token(*t.split()) for t in '\n'.join(sents).split('\n')]} + return {'raw': raw, 'sentences': [_fmt_sent(sent) for sent in sents]} + + +def _fmt_sent(sent): + return { + 'tokens': [_fmt_token(*t.split()) for t in sent.strip().split('\n')], + 'brackets': []} def _fmt_token(id_, word, hyph, pos, ner, head, dep, blank1, blank2, blank3): - return {'id': int(id_)-1, 'orth': word, 'tag': pos, 'dep': dep, 'head': int(head)-1} + head = int(head) - 1 + id_ = int(id_) - 1 + head = (head - id_) if head != -1 else 0 + return {'id': id_, 'orth': word, 'tag': pos, 'dep': dep, 'head': head} tags_re = re.compile(r'<[\w\?/][^>]+>')