From e140e03516845cd1bc507420c2bcbe1f3ae6571c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 27 May 2015 17:04:29 +0200
Subject: [PATCH] * Read in OntoNotes. Doesn't support train/test/dev split
 yet

---
 bin/prepare_treebank.py | 191 ++++++++++++++++++++++++++++------------
 1 file changed, 133 insertions(+), 58 deletions(-)

diff --git a/bin/prepare_treebank.py b/bin/prepare_treebank.py
index acd544944..34c2de3e6 100644
--- a/bin/prepare_treebank.py
+++ b/bin/prepare_treebank.py
@@ -21,10 +21,13 @@ doc: {
 
 Consumes output of spacy/munge/align_raw.py
 """
+from __future__ import unicode_literals
 import plac
 import json
 from os import path
+import os
 import re
+import codecs
 
 from spacy.munge import read_ptb
 from spacy.munge import read_conll
@@ -40,78 +43,150 @@ def _iter_raw_files(raw_loc):
 def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
     ptb_sents = read_ptb.split(ptb_text)
     dep_sents = read_conll.split(dep_text)
-    ner_sents = read_ner.split(ner_text) if ner_text is not None else None
-
-    assert len(ptb_sents) == len(dep_sents)
+    if len(ptb_sents) != len(dep_sents):
+        return None
+    if ner_text is not None:
+        ner_sents = read_ner.split(ner_text)
+    else:
+        ner_sents = [None] * len(ptb_sents)
 
     i = 0
-    doc = {'id': file_id, 'paragraphs': []}
-    for raw_sents in raw_paras:
-        para = {
-            'raw': ' '.join(sent.replace('<SEP>', '') for sent in raw_sents),
-            'sents': [],
-            'tokens': [],
-            'brackets': [],
-            'entities': []}
-        offset = 0
-        for raw_sent in raw_sents:
-            _, brackets = read_ptb.parse(ptb_sents[i], strip_bad_periods=True)
-            _, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True)
-            if ner_sents is not None:
-                _, ner = read_ner.parse(ner_sents[i], strip_bad_periods=True)
-                assert len(ner) == len(annot)
-            else:
-                ner = ['-' for _ in annot]
-            for token_id, token in enumerate(annot):
-                try:
-                    head = (token['head'] + offset) if token['head'] != -1 else -1
-                    para['tokens'].append({
-                        'id': offset + token_id,
-                        'orth': token['word'],
-                        'tag': token['tag'],
-                        'head': head,
-                        'dep': token['dep'],
-                        'ner': ner[token_id]})
-                except:
-                    raise
-            for label, start, end in brackets:
-                if start != end:
-                    para['brackets'].append({
-                        'label': label,
-                        'first': start + offset,
-                        'last': (end-1) + offset})
-            i += 1
-            offset += len(annot)
-            para['sents'].append(offset)
-        doc['paragraphs'].append(para)
+    doc = {'id': file_id}
+    if raw_paras is None:
+        doc['paragraphs'] = [format_para(None, ptb_sents, dep_sents, ner_sents)]
+    else:
+        doc['paragraphs'] = []
+        for raw_sents in raw_paras:
+            doc['paragraphs'].append(
+                format_para(
+                    ' '.join(raw_sents).replace('<SEP>', ''),
+                    ptb_sents[i:i+len(raw_sents)],
+                    dep_sents[i:i+len(raw_sents)],
+                    ner_sents[i:i+len(raw_sents)]))
+            i += len(raw_sents)
     return doc
 
 
-def main(onto_dir, raw_dir, out_dir):
+def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
+    para = {
+        'raw': raw_text,
+        'sents': [],
+        'tokens': [],
+        'brackets': []}
+    offset = 0
+    assert len(ptb_sents) == len(dep_sents) == len(ner_sents)
+    for ptb_text, dep_text, ner_text in zip(ptb_sents, dep_sents, ner_sents):
+        _, annot = read_conll.parse(dep_text, strip_bad_periods=True)
+        if ner_text is not None:
+            _, ner = read_ner.parse(ner_text, strip_bad_periods=True)
+        else:
+            ner = ['-' for _ in annot]
+        for token_id, (token, token_ent) in enumerate(zip(annot, ner)):
+            para['tokens'].append(format_token(offset, token_id, token, token_ent))
+
+        _, brackets = read_ptb.parse(ptb_text, strip_bad_periods=True)
+        for label, start, end in brackets:
+            if start != end:
+                para['brackets'].append({
+                    'label': label,
+                    'first': start + offset,
+                    'last': (end-1) + offset})
+        offset += len(annot)
+        para['sents'].append(offset)
+    return para
+
+
+def format_token(offset, token_id, token, ner):
+    head = (token['head'] + offset) if token['head'] != -1 else -1
+    return {
+        'id': offset + token_id,
+        'orth': token['word'],
+        'tag': token['tag'],
+        'head': head,
+        'dep': token['dep'],
+        'ner': ner}
+
+
+def read_file(*pieces):
+    loc = path.join(*pieces)
+    if not path.exists(loc):
+        return None
+    else:
+        return codecs.open(loc, 'r', 'utf8').read().strip()
+
+
+def get_file_names(section_dir, subsection):
+    filenames = []
+    for fn in os.listdir(path.join(section_dir, subsection)):
+        filenames.append(fn.rsplit('.', 1)[0])
+    return list(sorted(set(filenames)))
+
+
+def main(onto_dir, raw_dir, out_loc):
+    # All but WSJ --- we do that separately, as we have the source docs
+    sections = [
+        'bc/cctv',
+        'bc/cnn',
+        'bc/msnbc',
+        'bc/p2.5_a2e',
+        'bc/p2.5_c2e',
+        'bc/phoenix',
+        'bn/abc',
+        'bn/cnn',
+        'bn/mnb',
+        'bn/nbc',
+        'bn/p2.5_a2e',
+        'bn/p2.5_c2e',
+        'bn/pri',
+        'bn/voa',
+        'mz/sinorama',
+        'nw/dev_09_c2e',
+        'nw/p2.5_a2e',
+        'nw/p2.5_c2e',
+        'nw/xinhua',
+        'pt/ot',
+        'tc/ch',
+        'wb/a2e',
+        'wb/c2e',
+        'wb/eng',
+        'wb/dev_09_c2e',
+        'wb/p2.5_a2e',
+        'wb/p2.5_c2e',
+        'wb/sel'
+    ]
+    docs = []
+    for section in sections:
+        section_dir = path.join(onto_dir, 'data', 'english', 'annotations', section)
+        print section, len(docs)
+        for subsection in os.listdir(section_dir):
+            for fn in get_file_names(section_dir, subsection):
+                ptb = read_file(section_dir, subsection, '%s.parse' % fn)
+                dep = read_file(section_dir, subsection, '%s.parse.dep' % fn)
+                ner = read_file(section_dir, subsection, '%s.name' % fn)
+                if ptb is not None:
+                    doc = format_doc(fn, None, ptb, dep, ner)
+                    if doc is not None:
+                        docs.append(doc)
+    # Now do WSJ, with source alignment
+    onto_dir = path.join(onto_dir, 'data', 'english', 'annotations', 'nw', 'wsj')
     for i in range(25):
         section = str(i) if i >= 10 else ('0' + str(i))
         raw_loc = path.join(raw_dir, 'wsj%s.json' % section)
-        docs = []
         for j, (filename, raw_paras) in enumerate(_iter_raw_files(raw_loc)):
             if section == '00':
                 j += 1
             if section == '04' and filename == '55':
                 continue
-            ptb_loc = path.join(onto_dir, section, '%s.parse' % filename)
-            dep_loc = ptb_loc + '.dep'
-            ner_loc = path.join(onto_dir, section, '%s.name' % filename)
-            if path.exists(ptb_loc) and path.exists(dep_loc) and path.exists(ner_loc):
-                docs.append(
-                    format_doc(
-                        filename,
-                        raw_paras,
-                        open(ptb_loc).read().strip(),
-                        open(dep_loc).read().strip(),
-                        open(ner_loc).read().strip() if path.exists(ner_loc) else None))
-        with open(path.join(out_dir, '%s.json' % section), 'w') as file_:
-            json.dump(docs, file_, indent=4)
+            ptb = read_file(onto_dir, section, '%s.parse' % filename)
+            dep = read_file(onto_dir, section, '%s.parse.dep' % filename)
+            ner = read_file(onto_dir, section, '%s.name' % filename)
+            if ptb is not None and dep is not None:
+                docs.append(format_doc(filename, raw_paras, ptb, dep, ner))
+    print 'nw/wsj', len(docs)
+    with open(out_loc, 'w') as file_:
+        json.dump(docs, file_, indent=4)
+
 
 if __name__ == '__main__':
     plac.call(main)
--
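
Usage note (appended below the signature line, so it is not part of the diff
itself): a minimal sketch of how the patched script would be driven. The paths
are hypothetical placeholders; plac builds the command line from main()'s
signature, so the three positional arguments bind to onto_dir, raw_dir and
out_loc in order:

    # Hypothetical invocation; the paths are placeholders, not given by the patch:
    #   python bin/prepare_treebank.py /data/ontonotes /data/wsj_raw /data/onto.json
    # which is equivalent to calling main() directly:
    import prepare_treebank  # assumes bin/ is on sys.path
    prepare_treebank.main(
        '/data/ontonotes',   # onto_dir: release root holding data/english/annotations/
        '/data/wsj_raw',     # raw_dir: wsjNN.json files from spacy/munge/align_raw.py
        '/data/onto.json')   # out_loc: single JSON file covering all sections

Note that all sections are written to one output file, which is why the commit
message flags the missing train/test/dev split.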