# spaCy/spacy/munge/align_raw.py

"""Align the raw sentences from Read et al (2012) to the PTB tokenization,
outputting as a .json file. Used in bin/prepare_treebank.py
"""
import plac
from pathlib import Path
import json
from os import path
import os

from spacy.munge import read_ptb
from spacy.munge.read_ontonotes import sgml_extract


def read_odc(section_loc):
    """Read one raw-text section file and return its sentences by paragraph.

    Every line starting with '[' contributes one sentence: the text after
    the first '|' on that line, stripped and patched. Any other line closes
    the paragraph collected so far (so consecutive such lines yield empty
    paragraphs). The paragraph still open at end of file is always appended.

    Returns a list of paragraphs, each a list of sentence strings.
    """
    # Arbitrary substitutions applied, in this order, to the _raw_ text to
    # promote alignment with the treebank tokenization.
    substitutions = (
        ('. . . .', '...'),
        ('....', '...'),
        ('Co..', 'Co.'),
        ("`", "'"),
        # OntoNotes specific
        (" S$", " US$"),
        ("Showtime or a sister service", "Showtime or a service"),
        ("The hotel and gaming company", "The hotel and Gaming company"),
        ("I'm-coming-down-your-throat", "I-'m coming-down-your-throat"),
    )

    finished = []
    current = []
    with open(section_loc) as handle:
        for raw_line in handle:
            if not raw_line.startswith('['):
                # Paragraph boundary: bank what we have and start afresh.
                finished.append(current)
                current = []
                continue
            text = raw_line.split('|', 1)[1].strip()
            for old, new in substitutions:
                text = text.replace(old, new)
            current.append(text)
    finished.append(current)
    return finished


def read_ptb_sec(ptb_sec_dir):
    """Load every parse file in a treebank section directory.

    Only directory entries whose path ends in 'parse' or 'mrg' are read.
    For each such file the result holds one list of (filename, sentence)
    pairs, where filename is the entry's base name up to its first '.' and
    sentence is the space-joined, re-formed token string of one parse.
    """
    section = Path(ptb_sec_dir)
    per_file = []
    for entry in section.iterdir():
        if not str(entry).endswith(('parse', 'mrg')):
            continue
        stem = entry.parts[-1].split('.')[0]
        with entry.open() as handle:
            contents = handle.read()
        file_sents = []
        for tree in read_ptb.split(contents):
            # The bracket structure is not needed here, only the tokens.
            tokens, _ = read_ptb.parse(tree, strip_bad_periods=True)
            rebuilt = ' '.join([_reform_ptb_word(token) for token in tokens])
            file_sents.append((stem, rebuilt))
        per_file.append(file_sents)
    return per_file


def _reform_ptb_word(tok):
    """Undo treebank escapes in a single token and return the result.

    The substitutions run strictly in the order listed; the order matters
    because the backtick rewrite can produce a pair of single quotes that
    the following rule then turns into a double quote.
    """
    rewrites = (
        ("``", '"'),
        ("`", "'"),
        ("''", '"'),
        ('\\', ''),
        ('-LCB-', '{'),
        ('-RCB-', '}'),
        ('-RRB-', ')'),
        ('-LRB-', '('),
        ("'T-", "'T"),
    )
    for old, new in rewrites:
        tok = tok.replace(old, new)
    return tok
 

def get_alignment(raw_by_para, ptb_by_file):
    """Pair raw sentences with treebank sentences, in order.

    Args:
        raw_by_para: list of paragraphs, each a list of raw sentence strings.
        ptb_by_file: list of files, each a list of (filename, sentence)
            pairs holding the space-joined treebank tokens.

    Returns:
        A list of (f_id, p_id, f_sent_id, (filename, text)) tuples, one per
        aligned treebank sentence. text is the treebank string in which
        every space that has no counterpart in the raw sentence is replaced
        by '<SEP>'.

    Raises:
        Exception: if, after walking all raw sentences, some treebank
            sentences were left unaligned. The skipped (ptb, raw) pairs are
            printed first to help diagnose the mismatch.
    """
    # These are list-of-lists, by paragraph and file respectively.
    # Flatten them into a list of (outer_id, inner_id, item) triples
    raw_sents = _flatten(raw_by_para)
    ptb_sents = list(_flatten(ptb_by_file))

    output = []
    ptb_idx = 0
    n_skipped = 0
    skips = []
    for p_id, p_sent_id, raw in raw_sents:
        if ptb_idx >= len(ptb_sents):
            # Treebank exhausted; remaining raw sentences cannot be paired.
            n_skipped += 1
            continue
        f_id, f_sent_id, (ptb_id, ptb) = ptb_sents[ptb_idx]
        alignment = align_chars(raw, ptb)
        # Only None signals failure. An empty list is a valid alignment of
        # two empty strings and must not be mistaken for a mismatch.
        if alignment is None:
            skips.append((ptb, raw))
            n_skipped += 1
            continue
        ptb_idx += 1
        # `is False` (not falsiness): a raw index of 0 is a real alignment.
        sepped = ['<SEP>' if aligned is False else char
                  for char, aligned in zip(ptb, alignment)]
        output.append((f_id, p_id, f_sent_id, (ptb_id, ''.join(sepped))))
    if n_skipped + len(ptb_sents) != len(raw_sents):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        for ptb, raw in skips:
            print(ptb)
            print(raw)
        raise Exception(
            'Alignment failed: %d raw sentences, %d PTB sentences, '
            '%d skipped' % (len(raw_sents), len(ptb_sents), n_skipped))
    return output


def _flatten(nested):
    """Flatten a list of lists into (outer_index, inner_index, item) triples.

    Triples appear in iteration order; empty inner lists contribute nothing
    but still consume an outer index.
    """
    return [(outer, inner, item)
            for outer, group in enumerate(nested)
            for inner, item in enumerate(group)]


def align_chars(raw, ptb):
    """Align the characters of a treebank string to a raw string.

    The two strings must be identical once spaces are removed; otherwise
    None is returned.

    Returns:
        A list with one entry per character of ptb:
          * an int: index in raw of the matching non-space character;
          * True: a space that is also present at that point in raw;
          * False: a space in ptb with no corresponding space in raw.
        None if the strings differ in anything other than spacing.

    Runs of several spaces and leading/trailing spaces on either side are
    handled: each step consumes exactly one character from whichever side
    has an unmatched space, instead of assuming a non-space follows.
    """
    if raw.replace(' ', '') != ptb.replace(' ', ''):
        return None

    n_raw = len(raw)
    n_ptb = len(ptb)
    alignment = [False] * n_ptb
    i = 0
    j = 0
    while i < n_raw and j < n_ptb:
        raw_is_space = raw[i] == ' '
        ptb_is_space = ptb[j] == ' '
        if raw_is_space and ptb_is_space:
            alignment[j] = True
            i += 1
            j += 1
        elif raw_is_space:
            # Extra space in the raw text only: skip it.
            i += 1
        elif ptb_is_space:
            # Tokenization-only space: leave alignment[j] as False.
            j += 1
        else:
            # Both non-space. They are guaranteed equal because the
            # space-stripped strings matched above.
            alignment[j] = i
            i += 1
            j += 1
    return alignment


def group_into_files(sents):
    """Group consecutive aligned sentences by their file id.

    Args:
        sents: iterable of (f_id, p_id, s_id, (filename, sent)) tuples.

    Returns:
        A list of (filename, entries) pairs, one per run of equal f_id, in
        input order. entries is the list of (f_id, p_id, s_id, sent) tuples
        of that run, and filename is taken from the run's last sentence.

    The first sentence may carry any file id: a run is only closed once it
    actually contains something, so input whose first f_id is not 0 (for
    example because file 0 yielded no sentences) is grouped normally.
    """
    output = []
    this = []
    last_id = None
    last_fn = None
    for f_id, p_id, s_id, (filename, sent) in sents:
        if this and f_id != last_id:
            output.append((last_fn, this))
            this = []
        last_fn = filename
        last_id = f_id
        this.append((f_id, p_id, s_id, sent))
    if this:
        output.append((last_fn, this))
    return output


def group_into_paras(sents):
    """Split one file's sentences into paragraphs.

    Takes (f_id, p_id, s_id, sent) tuples and returns a list of lists of
    sent values, starting a new inner list whenever p_id differs from that
    of the preceding sentence.
    """
    paragraphs = []
    previous = None
    for _, para_id, _, sentence in sents:
        if paragraphs and para_id == previous:
            paragraphs[-1].append(sentence)
        else:
            paragraphs.append([sentence])
        previous = para_id
    return paragraphs


def get_sections(odc_dir, ptb_dir, out_dir):
    """Yield (raw_text_path, ptb_section_dir, json_output_path) per section.

    Sections are numbered 00 through 24, zero-padded to two digits.
    """
    for number in range(25):
        tag = '%02d' % number
        yield (
            path.join(odc_dir, 'wsj%s.txt' % tag),
            path.join(ptb_dir, tag),
            path.join(out_dir, 'wsj%s.json' % tag),
        )


def do_wsj(odc_dir, ptb_dir, out_dir):
    """Align every WSJ section and write one JSON file per section.

    For each section the raw paragraphs and the treebank files are read and
    aligned, then regrouped as a list of (filename, paragraphs) pairs and
    dumped to the section's output location.
    """
    for raw_loc, section_dir, json_loc in get_sections(odc_dir, ptb_dir, out_dir):
        aligned = get_alignment(read_odc(raw_loc), read_ptb_sec(section_dir))
        grouped = []
        for name, file_sents in group_into_files(aligned):
            grouped.append((name, group_into_paras(file_sents)))
        with open(json_loc, 'w') as sink:
            json.dump(grouped, sink)


def do_web(src_dir, onto_dir, out_dir):
    """Locate source/parse file pairs for the English web annotations.

    Reads onto_dir/map.txt, keeping only lines that split into exactly two
    whitespace-separated fields (annotation name, source name). For every
    annotation name starting with 'eng', the matching '.parse' and '.sgm'
    files are loaded if both exist, and 'Found' or 'Miss' is printed.

    NOTE(review): the loaded documents are not used further and out_dir is
    never read in this function; it appears to be work in progress.
    """
    mapping = {}
    with open(path.join(onto_dir, 'map.txt')) as map_file:
        for line in map_file:
            fields = line.split()
            if len(fields) == 2:
                mapping[fields[0]] = fields[1]

    for annot_fn, src_fn in mapping.items():
        if not annot_fn.startswith('eng'):
            continue

        ptb_loc = path.join(onto_dir, annot_fn + '.parse')
        src_loc = path.join(src_dir, src_fn + '.sgm')

        if path.exists(ptb_loc) and path.exists(src_loc):
            # Use context managers so the handles are closed promptly
            # instead of being left to the garbage collector.
            with open(src_loc) as src_file:
                src_doc = sgml_extract(src_file.read())
            with open(ptb_loc) as ptb_file:
                ptb_text = ptb_file.read()
            ptb_doc = [read_ptb.parse(parse_str, strip_bad_periods=True)[0]
                       for parse_str in read_ptb.split(ptb_text)]
            # Single-argument print(...) works on both Python 2 and 3.
            print('Found')
        else:
            print('Miss')


def may_mkdir(parent, *subdirs):
    """Create parent and the nested chain parent/sub1/sub2/... if missing.

    Each level is created only when it does not already exist, so the call
    is safe to repeat. Every element of subdirs is created, including the
    deepest one.

    Args:
        parent: directory under which the chain is created; created itself
            if absent (its own parent must already exist).
        *subdirs: successive directory names, each nested in the previous.
    """
    current = parent
    if not path.exists(current):
        os.mkdir(current)
    for subdir in subdirs:
        current = path.join(current, subdir)
        if not path.exists(current):
            os.mkdir(current)


def main(odc_dir, onto_dir, out_dir):
    # Entry point: prepare the output directories, then run the web step.
    for corpus in ('wsj', 'web'):
        may_mkdir(out_dir, corpus, 'align')
    # The WSJ step is currently disabled, which leaves odc_dir unused:
    #do_wsj(odc_dir, path.join(ontonotes_dir, 'wsj', 'orig'),
    #       path.join(out_dir, 'wsj', 'align'))
    english = path.join(onto_dir, 'data', 'english')
    web_src = path.join(english, 'metadata', 'context', 'wb', 'sel')
    web_annot = path.join(english, 'annotations', 'wb')
    web_out = path.join(out_dir, 'web', 'align')
    do_web(web_src, web_annot, web_out)


if __name__ == '__main__':
    # Run as a script: hand main to plac, which supplies its arguments
    # from the command line.
    plac.call(main)
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`"""Align the raw sentences from Read et al (2012) to the PTB tokenization,`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`outputting as a .json file. Used in bin/prepare_treebank.py`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`"""`
			`import plac`
			`from pathlib import Path`
			`import json`
			`from os import path`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`import os`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00
			`from spacy.munge import read_ptb`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`from spacy.munge.read_ontonotes import sgml_extract`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00

* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`def read_odc(section_loc):`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`# Arbitrary patches applied to the _raw_ text to promote alignment.`
			`patches = (`
			`('. . . .', '...'),`
			`('....', '...'),`
			`('Co..', 'Co.'),`
			("`", "'"),
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`# OntoNotes specific`
			`(" S$", " US$"),`
			`("Showtime or a sister service", "Showtime or a service"),`
			`("The hotel and gaming company", "The hotel and Gaming company"),`
			`("I'm-coming-down-your-throat", "I-'m coming-down-your-throat"),`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`)`

			`paragraphs = []`
			`with open(section_loc) as file_:`
			`para = []`
			`for line in file_:`
			`if line.startswith('['):`
			`line = line.split('\|', 1)[1].strip()`
			`for find, replace in patches:`
			`line = line.replace(find, replace)`
			`para.append(line)`
			`else:`
			`paragraphs.append(para)`
			`para = []`
			`paragraphs.append(para)`
			`return paragraphs`


			`def read_ptb_sec(ptb_sec_dir):`
			`ptb_sec_dir = Path(ptb_sec_dir)`
			`files = []`
			`for loc in ptb_sec_dir.iterdir():`
			`if not str(loc).endswith('parse') and not str(loc).endswith('mrg'):`
			`continue`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`filename = loc.parts[-1].split('.')[0]`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`with loc.open() as file_:`
			`text = file_.read()`
			`sents = []`
			`for parse_str in read_ptb.split(text):`
			`words, brackets = read_ptb.parse(parse_str, strip_bad_periods=True)`
			`words = [_reform_ptb_word(word) for word in words]`
			`string = ' '.join(words)`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`sents.append((filename, string))`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`files.append(sents)`
			`return files`


			`def _reform_ptb_word(tok):`
			tok = tok.replace("``", '"')
			tok = tok.replace("`", "'")
			`tok = tok.replace("''", '"')`
			`tok = tok.replace('\\', '')`
			`tok = tok.replace('-LCB-', '{')`
			`tok = tok.replace('-RCB-', '}')`
			`tok = tok.replace('-RRB-', ')')`
			`tok = tok.replace('-LRB-', '(')`
			`tok = tok.replace("'T-", "'T")`
			`return tok`


			`def get_alignment(raw_by_para, ptb_by_file):`
			`# These are list-of-lists, by paragraph and file respectively.`
			`# Flatten them into a list of (outer_id, inner_id, item) triples`
			`raw_sents = _flatten(raw_by_para)`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`ptb_sents = list(_flatten(ptb_by_file))`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00
			`output = []`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`ptb_idx = 0`
			`n_skipped = 0`
			`skips = []`
			`for (p_id, p_sent_id, raw) in raw_sents:`
			`#print raw`
			`if ptb_idx >= len(ptb_sents):`
			`n_skipped += 1`
			`continue`
			`f_id, f_sent_id, (ptb_id, ptb) = ptb_sents[ptb_idx]`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`alignment = align_chars(raw, ptb)`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`if not alignment:`
			`skips.append((ptb, raw))`
			`n_skipped += 1`
			`continue`
			`ptb_idx += 1`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`sepped = []`
			`for i, c in enumerate(ptb):`
			`if alignment[i] is False:`
			`sepped.append('<SEP>')`
			`else:`
			`sepped.append(c)`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`output.append((f_id, p_id, f_sent_id, (ptb_id, ''.join(sepped))))`
			`if n_skipped + len(ptb_sents) != len(raw_sents):`
			`for ptb, raw in skips:`
			`print ptb`
			`print raw`
			`raise Exception`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`return output`


			`def _flatten(nested):`
			`flat = []`
			`for id1, inner in enumerate(nested):`
			`flat.extend((id1, id2, item) for id2, item in enumerate(inner))`
			`return flat`


			`def align_chars(raw, ptb):`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`if raw.replace(' ', '') != ptb.replace(' ', ''):`
			`return None`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`i = 0`
			`j = 0`

			`length = len(raw)`
			`alignment = [False for _ in range(len(ptb))]`
			`while i < length:`
			`if raw[i] == ' ' and ptb[j] == ' ':`
			`alignment[j] = True`
			`i += 1`
			`j += 1`
			`elif raw[i] == ' ':`
			`i += 1`
			`elif ptb[j] == ' ':`
			`j += 1`
			`assert raw[i].lower() == ptb[j].lower(), raw[i:1]`
			`alignment[j] = i`
			`i += 1; j += 1`
			`return alignment`


			`def group_into_files(sents):`
			`last_id = 0`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`last_fn = None`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`this = []`
			`output = []`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`for f_id, p_id, s_id, (filename, sent) in sents:`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`if f_id != last_id:`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`assert last_fn is not None`
			`output.append((last_fn, this))`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`this = []`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`last_fn = filename`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`this.append((f_id, p_id, s_id, sent))`
			`last_id = f_id`
			`if this:`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`assert last_fn is not None`
			`output.append((last_fn, this))`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`return output`


			`def group_into_paras(sents):`
			`last_id = 0`
			`this = []`
			`output = []`
			`for f_id, p_id, s_id, sent in sents:`
			`if p_id != last_id and this:`
			`output.append(this)`
			`this = []`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`this.append(sent)`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`last_id = p_id`
			`if this:`
			`output.append(this)`
			`return output`


			`def get_sections(odc_dir, ptb_dir, out_dir):`
			`for i in range(25):`
			`section = str(i) if i >= 10 else ('0' + str(i))`
			`odc_loc = path.join(odc_dir, 'wsj%s.txt' % section)`
			`ptb_sec = path.join(ptb_dir, section)`
			`out_loc = path.join(out_dir, 'wsj%s.json' % section)`
			`yield odc_loc, ptb_sec, out_loc`


* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`def do_wsj(odc_dir, ptb_dir, out_dir):`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`for odc_loc, ptb_sec_dir, out_loc in get_sections(odc_dir, ptb_dir, out_dir):`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`raw_paragraphs = read_odc(odc_loc)`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`ptb_files = read_ptb_sec(ptb_sec_dir)`
			`aligned = get_alignment(raw_paragraphs, ptb_files)`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`files = [(fn, group_into_paras(sents))`
			`for fn, sents in group_into_files(aligned)]`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`with open(out_loc, 'w') as file_:`
			`json.dump(files, file_)`


* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`def do_web(src_dir, onto_dir, out_dir):`
			`mapping = dict(line.split() for line in open(path.join(onto_dir, 'map.txt'))`
			`if len(line.split()) == 2)`
			`for annot_fn, src_fn in mapping.items():`
			`if not annot_fn.startswith('eng'):`
			`continue`

			`ptb_loc = path.join(onto_dir, annot_fn + '.parse')`
			`src_loc = path.join(src_dir, src_fn + '.sgm')`

			`if path.exists(ptb_loc) and path.exists(src_loc):`
			`src_doc = sgml_extract(open(src_loc).read())`
			`ptb_doc = [read_ptb.parse(parse_str, strip_bad_periods=True)[0]`
			`for parse_str in read_ptb.split(open(ptb_loc).read())]`
			`print 'Found'`
			`else:`
			`print 'Miss'`


			`def may_mkdir(parent, *subdirs):`
			`if not path.exists(parent):`
			`os.mkdir(parent)`
			`for i in range(1, len(subdirs)):`
			`directories = (parent,) + subdirs[:i]`
			`subdir = path.join(*directories)`
			`if not path.exists(subdir):`
			`os.mkdir(subdir)`


			`def main(odc_dir, onto_dir, out_dir):`
			`may_mkdir(out_dir, 'wsj', 'align')`
			`may_mkdir(out_dir, 'web', 'align')`
			`#do_wsj(odc_dir, path.join(ontonotes_dir, 'wsj', 'orig'),`
			`# path.join(out_dir, 'wsj', 'align'))`
			`do_web(`
			`path.join(onto_dir, 'data', 'english', 'metadata', 'context', 'wb', 'sel'),`
			`path.join(onto_dir, 'data', 'english', 'annotations', 'wb'),`
			`path.join(out_dir, 'web', 'align'))`



* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`if __name__ == '__main__':`
			`plac.call(main)`