# spaCy/bin/prepare_treebank.py
"""Convert OntoNotes into a json format.
doc: {
id: string,
paragraphs: [{
raw: string,
sents: [int],
tokens: [{
start: int,
tag: string,
head: int,
dep: string}],
ner: [{
start: int,
end: int,
label: string}],
brackets: [{
start: int,
end: int,
label: string}]}]}
2015-05-06 17:31:00 +03:00
Consumes output of spacy/munge/align_raw.py
"""

import plac
import json
from os import path

from spacy.munge import read_ptb
from spacy.munge import read_conll
from spacy.munge import read_ner


def _iter_raw_files(raw_loc):
    # Each entry pairs a file name with its paragraphs (lists of sentence strings).
    files = json.load(open(raw_loc))
    for f in files:
        yield f


def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
    ptb_sents = read_ptb.split(ptb_text)
    dep_sents = read_conll.split(dep_text)
    ner_sents = read_ner.split(ner_text) if ner_text is not None else None
    assert len(ptb_sents) == len(dep_sents)
    i = 0
    doc = {'id': file_id, 'paragraphs': []}
    for raw_sents in raw_paras:
        # Paragraph text is the raw sentences joined, with '<SEP>' markers removed.
        para = {
            'raw': ' '.join(sent.replace('<SEP>', '') for sent in raw_sents),
            'sents': [],
            'tokens': [],
            'brackets': [],
            'entities': []}
        offset = 0
        for raw_sent in raw_sents:
            _, brackets = read_ptb.parse(ptb_sents[i], strip_bad_periods=True)
            _, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True)
            if ner_sents is not None:
                _, ner = read_ner.parse(ner_sents[i], strip_bad_periods=True)
                assert len(ner) == len(annot)
            else:
                ner = ['-' for _ in annot]
            for token_id, token in enumerate(annot):
                # Heads are sentence-local indices; shift them by the running
                # paragraph offset, keeping -1 (no head) as-is.
                head = (token['head'] + offset) if token['head'] != -1 else -1
                para['tokens'].append({
                    'id': offset + token_id,
                    'orth': token['word'],
                    'tag': token['tag'],
                    'head': head,
                    'dep': token['dep'],
                    'ner': ner[token_id]})
            # Bracket spans, shifted to paragraph-level first/last token indices.
            for label, start, end in brackets:
                if start != end:
                    para['brackets'].append({
                        'label': label,
                        'first': start + offset,
                        'last': (end-1) + offset})
            i += 1
            offset += len(annot)
            # Sentence boundaries are recorded as running token offsets.
            para['sents'].append(offset)
        doc['paragraphs'].append(para)
    return doc


def main(onto_dir, raw_dir, out_dir):
    # onto_dir: OntoNotes .parse/.parse.dep/.name files, organised by section.
    # raw_dir: per-section wsjNN.json files produced by align_raw.py.
    # out_dir: destination directory for the per-section JSON output.
    for i in range(25):
        section = str(i) if i >= 10 else ('0' + str(i))
        raw_loc = path.join(raw_dir, 'wsj%s.json' % section)
        docs = []
        for j, (filename, raw_paras) in enumerate(_iter_raw_files(raw_loc)):
            if section == '00':
                j += 1
            if section == '04' and filename == '55':
                continue
            ptb_loc = path.join(onto_dir, section, '%s.parse' % filename)
            dep_loc = ptb_loc + '.dep'
            ner_loc = path.join(onto_dir, section, '%s.name' % filename)
            # Only convert files that have constituency, dependency and NER
            # annotations alongside each other.
            if path.exists(ptb_loc) and path.exists(dep_loc) and path.exists(ner_loc):
                docs.append(
                    format_doc(
                        filename,
                        raw_paras,
                        open(ptb_loc).read().strip(),
                        open(dep_loc).read().strip(),
                        open(ner_loc).read().strip()))
        with open(path.join(out_dir, '%s.json' % section), 'w') as file_:
            json.dump(docs, file_, indent=4)


if __name__ == '__main__':
    plac.call(main)
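

# Example invocation (the directory paths below are placeholders, not real
# locations); plac exposes main()'s three positional parameters as
# command-line arguments:
#
#     python bin/prepare_treebank.py /path/to/ontonotes /path/to/aligned_raw /path/to/output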