From e140e03516845cd1bc507420c2bcbe1f3ae6571c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 27 May 2015 17:04:29 +0200
Subject: [PATCH] * Read in OntoNotes. Doesn't support train/test/dev split
 yet

---
 bin/prepare_treebank.py | 191 ++++++++++++++++++++++++++++------------
 1 file changed, 133 insertions(+), 58 deletions(-)

diff --git a/bin/prepare_treebank.py b/bin/prepare_treebank.py
index acd544944..34c2de3e6 100644
--- a/bin/prepare_treebank.py
+++ b/bin/prepare_treebank.py
@@ -21,10 +21,13 @@ doc: {
 
 Consumes output of spacy/munge/align_raw.py
 """
+from __future__ import unicode_literals
 import plac
 import json
 from os import path
+import os
 import re
+import codecs
 
 from spacy.munge import read_ptb
 from spacy.munge import read_conll
@@ -40,78 +43,150 @@ def _iter_raw_files(raw_loc):
 def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
     ptb_sents = read_ptb.split(ptb_text)
     dep_sents = read_conll.split(dep_text)
-    ner_sents = read_ner.split(ner_text) if ner_text is not None else None
-
-    assert len(ptb_sents) == len(dep_sents)
+    if len(ptb_sents) != len(dep_sents):
+        return None
+    if ner_text is not None:
+        ner_sents = read_ner.split(ner_text)
+    else:
+        ner_sents = [None] * len(ptb_sents)
 
     i = 0
-    doc = {'id': file_id, 'paragraphs': []}
-    for raw_sents in raw_paras:
-        para = {
-            'raw': ' '.join(sent.replace('<SEP>', '') for sent in raw_sents),
-            'sents': [],
-            'tokens': [],
-            'brackets': [],
-            'entities': []}
-        offset = 0
-        for raw_sent in raw_sents:
-            _, brackets = read_ptb.parse(ptb_sents[i], strip_bad_periods=True)
-            _, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True)
-            if ner_sents is not None:
-                _, ner = read_ner.parse(ner_sents[i], strip_bad_periods=True)
-                assert len(ner) == len(annot)
-            else:
-                ner = ['-' for _ in annot]
-            for token_id, token in enumerate(annot):
-                try:
-                    head = (token['head'] + offset) if token['head'] != -1 else -1
-                    para['tokens'].append({
-                        'id': offset + token_id,
-                        'orth': token['word'],
-                        'tag': token['tag'],
-                        'head': head,
-                        'dep': token['dep'],
-                        'ner': ner[token_id]})
-                except:
-                    raise
-            for label, start, end in brackets:
-                if start != end:
-                    para['brackets'].append({
-                        'label': label,
-                        'first': start + offset,
-                        'last': (end-1) + offset})
-            i += 1
-            offset += len(annot)
-            para['sents'].append(offset)
-        doc['paragraphs'].append(para)
+    doc = {'id': file_id}
+    if raw_paras is None:
+        doc['paragraphs'] = [format_para(None, ptb_sents, dep_sents, ner_sents)]
+    else:
+        doc['paragraphs'] = []
+        for raw_sents in raw_paras:
+            doc['paragraphs'].append(
+                format_para(
+                    ' '.join(raw_sents).replace('<SEP>', ''),
+                    ptb_sents[i:i+len(raw_sents)],
+                    dep_sents[i:i+len(raw_sents)],
+                    ner_sents[i:i+len(raw_sents)]))
+            i += len(raw_sents)
     return doc
 
 
-def main(onto_dir, raw_dir, out_dir):
+def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
+    para = {
+        'raw': raw_text,
+        'sents': [],
+        'tokens': [],
+        'brackets': []}
+    offset = 0
+    assert len(ptb_sents) == len(dep_sents) == len(ner_sents)
+    for ptb_text, dep_text, ner_text in zip(ptb_sents, dep_sents, ner_sents):
+        _, annot = read_conll.parse(dep_text, strip_bad_periods=True)
+        if ner_text is not None:
+            _, ner = read_ner.parse(ner_text, strip_bad_periods=True)
+        else:
+            ner = ['-' for _ in annot]
+        for token_id, (token, token_ent) in enumerate(zip(annot, ner)):
+            para['tokens'].append(format_token(offset, token_id, token, token_ent))
+
+        _, brackets = read_ptb.parse(ptb_text, strip_bad_periods=True)
+        for label, start, end in brackets:
+            if start != end:
+                para['brackets'].append({
+                    'label': label,
+                    'first': start + offset,
+                    'last': (end-1) + offset})
+        offset += len(annot)
+        para['sents'].append(offset)
+    return para
+
+
+def format_token(offset, token_id, token, ner):
+    head = (token['head'] + offset) if token['head'] != -1 else -1
+    return {
+        'id': offset + token_id,
+        'orth': token['word'],
+        'tag': token['tag'],
+        'head': head,
+        'dep': token['dep'],
+        'ner': ner}
+
+
+def read_file(*pieces):
+    loc = path.join(*pieces)
+    if not path.exists(loc):
+        return None
+    else:
+        return codecs.open(loc, 'r', 'utf8').read().strip()
+
+
+def get_file_names(section_dir, subsection):
+    filenames = []
+    for fn in os.listdir(path.join(section_dir, subsection)):
+        filenames.append(fn.rsplit('.', 1)[0])
+    return list(sorted(set(filenames)))
+
+
+def main(onto_dir, raw_dir, out_loc):
+    # All but WSJ --- we do that separately, as we have the source docs
+    sections = [
+        'bc/cctv',
+        'bc/cnn',
+        'bc/msnbc',
+        'bc/p2.5_a2e',
+        'bc/p2.5_c2e',
+        'bc/phoenix',
+        'bn/abc',
+        'bn/cnn',
+        'bn/mnb',
+        'bn/nbc',
+        'bn/p2.5_a2e',
+        'bn/p2.5_c2e',
+        'bn/pri',
+        'bn/voa',
+        'mz/sinorama',
+        'nw/dev_09_c2e',
+        'nw/p2.5_a2e',
+        'nw/p2.5_c2e',
+        'nw/xinhua',
+        'pt/ot',
+        'tc/ch',
+        'wb/a2e',
+        'wb/c2e',
+        'wb/eng',
+        'wb/dev_09_c2e',
+        'wb/p2.5_a2e',
+        'wb/p2.5_c2e',
+        'wb/sel'
+    ]
+    docs = []
+    for section in sections:
+        section_dir = path.join(onto_dir, 'data', 'english', 'annotations', section)
+        print section, len(docs)
+        for subsection in os.listdir(section_dir):
+            for fn in get_file_names(section_dir, subsection):
+                ptb = read_file(section_dir, subsection, '%s.parse' % fn)
+                dep = read_file(section_dir, subsection, '%s.parse.dep' % fn)
+                ner = read_file(section_dir, subsection, '%s.name' % fn)
+                if ptb is not None:
+                    doc = format_doc(fn, None, ptb, dep, ner)
+                    if doc is not None:
+                        docs.append(doc)
+    # Now do WSJ, with source alignment
+    onto_dir = path.join(onto_dir, 'data', 'english', 'annotations', 'nw', 'wsj')
     for i in range(25):
         section = str(i) if i >= 10 else ('0' + str(i))
         raw_loc = path.join(raw_dir, 'wsj%s.json' % section)
-        docs = []
         for j, (filename, raw_paras) in enumerate(_iter_raw_files(raw_loc)):
             if section == '00':
                 j += 1
             if section == '04' and filename == '55':
                 continue
-            ptb_loc = path.join(onto_dir, section, '%s.parse' % filename)
-            dep_loc = ptb_loc + '.dep'
-            ner_loc = path.join(onto_dir, section, '%s.name' % filename)
-            if path.exists(ptb_loc) and path.exists(dep_loc) and path.exists(ner_loc):
-                docs.append(
-                    format_doc(
-                        filename,
-                        raw_paras,
-                        open(ptb_loc).read().strip(),
-                        open(dep_loc).read().strip(),
-                        open(ner_loc).read().strip() if path.exists(ner_loc) else None))
-        with open(path.join(out_dir, '%s.json' % section), 'w') as file_:
-            json.dump(docs, file_, indent=4)
+            ptb = read_file(onto_dir, section, '%s.parse' % filename)
+            dep = read_file(onto_dir, section, '%s.parse.dep' % filename)
+            ner = read_file(onto_dir, section, '%s.name' % filename)
+            if ptb is not None and dep is not None:
+                docs.append(format_doc(filename, raw_paras, ptb, dep, ner))
+    print 'nw/wsj', len(docs)
+    with open(out_loc, 'w') as file_:
+        json.dump(docs, file_, indent=4)
+
 
 if __name__ == '__main__':
     plac.call(main)
--
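
Usage note (appended below the signature line, so it is not part of the diff
itself): a minimal sketch of how the patched script would be driven. The paths
are hypothetical placeholders; plac builds the command line from main()'s
signature, so the three positional arguments bind to onto_dir, raw_dir and
out_loc in order:

    # Hypothetical invocation; the paths are placeholders, not given by the patch:
    #   python bin/prepare_treebank.py /data/ontonotes /data/wsj_raw /data/onto.json
    # which is equivalent to calling main() directly:
    import prepare_treebank  # assumes bin/ is on sys.path
    prepare_treebank.main(
        '/data/ontonotes',   # onto_dir: release root holding data/english/annotations/
        '/data/wsj_raw',     # raw_dir: wsjNN.json files from spacy/munge/align_raw.py
        '/data/onto.json')   # out_loc: single JSON file covering all sections

Note that all sections are written to one output file, which is why the commit
message flags the missing train/test/dev split.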