* Write JSON files, with both dependency and PSG parses

2025-12-24 02:23:19 +03:00 · 2015-05-05 02:31:20 +02:00 · 2015-05-05 02:31:20 +02:00 · 0ad72a77ce
commit 0ad72a77ce
parent 5078a32213
5 changed files with 293 additions and 14 deletions
--- a/bin/prepare_treebank.py
+++ b/bin/prepare_treebank.py
@ -60,14 +60,11 @@ def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
                    'brackets': []}
        for raw_sent in raw_sents:
            para['sents'].append(offset) 
-            _, brackets = read_ptb.parse(ptb_sents[i])
+            _, brackets = read_ptb.parse(ptb_sents[i], strip_bad_periods=True)
-            _, annot = read_conll.parse(dep_sents[i])
+            _, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True)
            indices, word_idx, offset = _get_word_indices(raw_sent, 0, offset)
            for token in annot:
                if token['head'] == -1:
                    head = indices[token['id']]
                else:
                head = indices[token['head']]
                try:
                    para['tokens'].append({'start': indices[token['id']],
@ -80,6 +77,7 @@ def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
                    print raw_sent
                    raise
            for label, start, end in brackets:
                if start != end:
                    para['brackets'].append({'label': label,
                        'start': indices[start],
                        'end': indices[end-1]})
@ -88,24 +86,25 @@ def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
    return doc
-def main(onto_dir, raw_dir, out_loc):
+def main(onto_dir, raw_dir, out_dir):
    docs = []
    for i in range(25):
        section = str(i) if i >= 10 else ('0' + str(i))
        raw_loc = path.join(raw_dir, 'wsj%s.json' % section)
        docs = []
        for j, raw_paras in enumerate(_iter_raw_files(raw_loc)):
            if section == '00':
                j += 1
            filename = str(j) if j >= 9 else ('0' + str(j))
            if section == '04' and filename == '55':
                continue
-            ptb_loc = path.join(onto_dir, section, 'wsj_%s%s.parse' % (section, filename))
+            ptb_loc = path.join(onto_dir, section, 'wsj_%s%s.mrg' % (section, filename))
-            dep_loc = ptb_loc + '.dep'
+            dep_loc = ptb_loc + '.3.pa.gs.tab'
            if path.exists(ptb_loc) and path.exists(dep_loc):
                print ptb_loc
                doc = format_doc(section, filename, raw_paras, ptb_loc, dep_loc)
                docs.append(doc)
-    json.dump(docs, open(out_loc, 'w'))
+        with open(path.join(out_dir, '%s.json' % section), 'w') as file_:
            json.dump(docs, file_)
 if __name__ == '__main__':
--- a/spacy/munge/init.py
+++ b/spacy/munge/init.py
--- a/spacy/munge/align_raw.py
+++ b/spacy/munge/align_raw.py
@ -0,0 +1,175 @@
 """Align the raw sentences from Read et al (2012) to the PTB tokenization,
 outputting the format:
 [{
    section: int,
    file: string,
    paragraphs: [{
        raw: string,
        segmented: string,
        tokens: [int]}]}]
 """
 import plac
 from pathlib import Path
 import json
 from os import path
 from spacy.munge import read_ptb
 def read_unsegmented(section_loc):
    """Read the file at section_loc into a list of paragraphs.

    Each line starting with '[' contributes the stripped text after its
    first '|' to the current paragraph, after a few literal substitutions.
    Any other line closes the current paragraph.  The paragraph still open
    at the end of the file is appended as well, so empty lists can appear
    in the result.
    """
    # Arbitrary patches applied to the _raw_ text to promote alignment.
    substitutions = (
        ('. . . .', '...'),
        ('....', '...'),
        ('Co..', 'Co.'),
        ("`", "'"),
    )
    paragraphs = []
    current = []
    with open(section_loc) as file_:
        for line in file_:
            if not line.startswith('['):
                # A non-sentence line acts as the paragraph separator.
                paragraphs.append(current)
                current = []
                continue
            text = line.split('|', 1)[1].strip()
            for old, new in substitutions:
                text = text.replace(old, new)
            current.append(text)
    paragraphs.append(current)
    return paragraphs
 def read_ptb_sec(ptb_sec_dir):
    """Read every PTB parse file found directly inside a section directory.

    Only entries whose path ends in 'parse' or 'mrg' are read.  They are
    visited in sorted path order.

    Returns a list with one item per file read.  Each item is a list of
    sentence strings: the words returned by read_ptb.parse (called with
    strip_bad_periods=True), each passed through _reform_ptb_word and then
    joined with single spaces.
    """
    ptb_sec_dir = Path(ptb_sec_dir)
    files = []
    # iterdir() yields entries in arbitrary order.  get_alignment pairs these
    # sentences with the raw ones purely by position, so the files have to be
    # visited in a deterministic, name-sorted order.
    for loc in sorted(ptb_sec_dir.iterdir()):
        if not str(loc).endswith(('parse', 'mrg')):
            continue
        with loc.open() as file_:
            text = file_.read()
        sents = []
        for parse_str in read_ptb.split(text):
            words, _ = read_ptb.parse(parse_str, strip_bad_periods=True)
            sents.append(' '.join(_reform_ptb_word(word) for word in words))
        files.append(sents)
    return files
 def _reform_ptb_word(tok):
    """Return tok after a fixed sequence of literal replacements.

    The replacements rewrite PTB quote tokens, drop backslashes, turn the
    -LCB-/-RCB-/-LRB-/-RRB- bracket codes into bracket characters and
    rewrite "'T-" as "'T".
    """
    # Applied strictly in this order: the doubled backtick must be handled
    # before the single one, and single backticks become apostrophes before
    # doubled apostrophes are collapsed into a double quote.
    replacements = (
        ("``", '"'),
        ("`", "'"),
        ("''", '"'),
        ('\\', ''),
        ('-LCB-', '{'),
        ('-RCB-', '}'),
        ('-RRB-', ')'),
        ('-LRB-', '('),
        ("'T-", "'T"),
    )
    for old, new in replacements:
        tok = tok.replace(old, new)
    return tok
 def get_alignment(raw_by_para, ptb_by_file):
    """Pair raw and PTB sentences by position and mark unaligned characters.

    Both arguments are lists of lists (by paragraph and by file
    respectively).  They are flattened with _flatten and must contain the
    same number of sentences.  For each pair, every character of the PTB
    string whose align_chars entry is False is replaced by '<SEP>'.

    Returns a list of (file_id, paragraph_id, sentence_id_in_file, string).
    """
    raw_sents = _flatten(raw_by_para)
    ptb_sents = _flatten(ptb_by_file)
    assert len(raw_sents) == len(ptb_sents)
    aligned = []
    for raw_item, ptb_item in zip(raw_sents, ptb_sents):
        para_id, _, raw = raw_item
        file_id, sent_id, ptb = ptb_item
        char_map = align_chars(raw, ptb)
        # Identity test on purpose: index 0 is a valid alignment and must not
        # be mistaken for "unaligned".
        marked = ''.join('<SEP>' if char_map[k] is False else char
                         for k, char in enumerate(ptb))
        aligned.append((file_id, para_id, sent_id, marked))
    return aligned
 def _flatten(nested):
    """Turn a list of lists into (outer_index, inner_index, item) triples.

    Empty inner lists contribute nothing, but still consume an outer index.
    """
    return [(outer, inner, item)
            for outer, group in enumerate(nested)
            for inner, item in enumerate(group)]
 def align_chars(raw, ptb):
    """Map each character of ptb onto a position in raw.

    Returns a list with one entry per character of ptb: the index of the
    matching character in raw, True for a space present in both strings, or
    False for a character with no counterpart in raw (for example a space
    that exists only in ptb).  Characters are compared case-insensitively.
    Spaces that occur only in raw are skipped, including runs of spaces and
    trailing ones.

    Raises AssertionError when two non-space characters disagree, or when
    ptb is exhausted while raw still has non-space characters left.
    """
    raw_len = len(raw)
    ptb_len = len(ptb)
    alignment = [False] * ptb_len
    i = 0
    j = 0
    while i < raw_len:
        # None stands for "ptb is exhausted"; it never equals a character.
        ptb_char = ptb[j] if j < ptb_len else None
        if raw[i] == ' ':
            if ptb_char == ' ':
                alignment[j] = True
                j += 1
            i += 1
            # Re-examine the next pair from the top: it may be another space.
            continue
        if ptb_char == ' ':
            j += 1
            continue
        # Report the unconsumed remainder of both strings on a mismatch.
        assert ptb_char is not None and raw[i].lower() == ptb_char.lower(), \
            (raw[i:], ptb[j:])
        alignment[j] = i
        i += 1
        j += 1
    return alignment
 def group_into_files(sents):
    """Group consecutive (f_id, p_id, s_id, sent) tuples that share an f_id.

    Returns a list of groups in input order; each group is a list of the
    original 4-tuples.  A new group starts whenever f_id differs from the
    previous tuple's f_id.  No empty group is ever emitted, whatever the
    first f_id is.
    """
    output = []
    this = []
    last_id = None
    for f_id, p_id, s_id, sent in sents:
        # Only flush a non-empty group, as group_into_paras does; otherwise a
        # first f_id other than 0 would produce a leading empty group.
        if this and f_id != last_id:
            output.append(this)
            this = []
        this.append((f_id, p_id, s_id, sent))
        last_id = f_id
    if this:
        output.append(this)
    return output
 def group_into_paras(sents):
    """Group the sentence strings of consecutive tuples that share a p_id.

    Takes (f_id, p_id, s_id, sent) tuples and returns a list of lists that
    holds only the sent values; a new inner list starts whenever p_id
    changes.  Empty groups are never emitted.
    """
    paragraphs = []
    current = []
    previous = None
    for _, p_id, _, sent in sents:
        if current and p_id != previous:
            paragraphs.append(current)
            current = []
        current.append(sent)
        previous = p_id
    if current:
        paragraphs.append(current)
    return paragraphs
 def get_sections(odc_dir, ptb_dir, out_dir):
    """Yield (odc_loc, ptb_sec, out_loc) paths for sections 00 through 24.

    odc_loc is '<odc_dir>/wsjNN.txt', ptb_sec is '<ptb_dir>/NN' and out_loc
    is '<out_dir>/wsjNN.json', where NN is the two-digit section number.
    """
    for number in range(25):
        section = str(number)
        if number < 10:
            # Single-digit sections are zero-padded to two characters.
            section = '0' + section
        yield (path.join(odc_dir, 'wsj%s.txt' % section),
               path.join(ptb_dir, section),
               path.join(out_dir, 'wsj%s.json' % section))
 def main(odc_dir, ptb_dir, out_dir):
    """Align every section and write one JSON file per section.

    For each (odc_loc, ptb_sec_dir, out_loc) produced by get_sections, the
    raw paragraphs and the PTB sentences are read and aligned, the result is
    grouped by file and then by paragraph, and the nested list is dumped as
    JSON to out_loc.
    """
    for odc_loc, ptb_sec_dir, out_loc in get_sections(odc_dir, ptb_dir, out_dir):
        aligned = get_alignment(read_unsegmented(odc_loc),
                                read_ptb_sec(ptb_sec_dir))
        files = []
        for file_sents in group_into_files(aligned):
            files.append(group_into_paras(file_sents))
        with open(out_loc, 'w') as file_:
            json.dump(files, file_)
 # Script entry point: when the module is run directly, hand main to plac.call.
 if __name__ == '__main__':
    plac.call(main)
--- a/spacy/munge/read_conll.py
+++ b/spacy/munge/read_conll.py
@ -0,0 +1,40 @@
 from __future__ import unicode_literals
 def split(text):
    """Split text on blank lines into stripped, non-empty sentence chunks."""
    sentences = []
    for chunk in text.split('\n\n'):
        chunk = chunk.strip()
        if chunk:
            sentences.append(chunk)
    return sentences
 def parse(sent_text, strip_bad_periods=False):
    """Parse one sentence of whitespace-separated 'word tag head dep' lines.

    Returns (words, annot).  annot holds one dict per kept token with the
    keys 'id', 'word', 'tag', 'head' and 'dep'.  'id' is the token's position
    among the kept tokens.  'head' is the file's head value minus one, except
    that a head of 0 is replaced by the token's own id.

    With strip_bad_periods set, a token for which _is_bad_period(previous
    word, word) is true is dropped and does not consume an id.
    """
    sent_text = sent_text.strip()
    assert sent_text
    words = []
    annot = []
    for line in sent_text.split('\n'):
        word, tag, head, dep = line.split()
        if strip_bad_periods and words and _is_bad_period(words[-1], word):
            continue
        # Ids count kept tokens only, so the next id is the current length.
        token_id = len(words)
        head = int(head)
        annot.append({
            'id': token_id,
            'word': word,
            'tag': tag,
            'head': head - 1 if head != 0 else token_id,
            'dep': dep})
        words.append(word)
    return words, annot
 def _is_bad_period(prev, period):
    """Return True when period is '.' and prev is a longer token ending in '.'.

    prev is only inspected when period is exactly '.'.
    """
    return period == '.' and prev != '.' and prev.endswith('.')
--- a/spacy/munge/read_ptb.py
+++ b/spacy/munge/read_ptb.py
@ -0,0 +1,65 @@
 import re
 import os
 from os import path
 def parse(sent_text, strip_bad_periods=False):
    """Parse one bracketed PTB sentence into words and labelled spans.

    Returns (words, brackets).  words lists the leaf tokens in order,
    leaving out leaves labelled '-NONE-'.  brackets lists (label, start,
    end) tuples, in closing order, for every closed bracket that did not
    add a word; start and end are word counts at opening and closing time.

    With strip_bad_periods set, a closing bracket whose text makes
    _is_bad_period(previous word, text) true is discarded entirely.
    """
    sent_text = sent_text.strip()
    assert sent_text and sent_text.startswith('(')
    token_re = re.compile(r'(\()([^\s\)\(]+)|([^\s\)\(]+)?(\))')
    # Normalise a leading '((' so that slicing off two characters below always
    # removes exactly the outermost opening bracket.
    if sent_text.startswith('(('):
        sent_text = sent_text.replace('((', '( (', 1)
    stack = []
    spans = []
    words = []
    for match in token_re.finditer(sent_text[2:-1]):
        opener, label, text, closer = match.groups()
        if opener:
            assert not closer
            assert label.strip()
            # Remember how many words precede this bracket.
            stack.append((label, len(words)))
            continue
        assert closer
        label, start = stack.pop()
        assert label.strip()
        if strip_bad_periods and words and _is_bad_period(words[-1], text):
            continue
        if not text or label == '-NONE-':
            # Traces leave 0-width bracket, but no token
            spans.append((label, start, len(words)))
        else:
            words.append(text)
    return words, spans
 def _is_bad_period(prev, period):
    """Tell whether period is a '.' that follows a longer token ending in '.'.

    Returns False without inspecting prev when period is not exactly '.'.
    """
    if period != '.' or prev == '.':
        return False
    return prev.endswith('.')
 def split(text):
    """Split the text of a PTB file into one string per sentence.

    Blank lines are dropped and trailing whitespace is removed from each
    line.  A line starting with '(' begins a new sentence; the lines of a
    sentence are re-joined with newlines.
    """
    sentences = []
    buffered = []
    for raw_line in text.strip().split('\n'):
        line = raw_line.rstrip()
        if not line:
            continue
        # Detect the start of sentences by line starting with (
        # This is messy, but it keeps bracket parsing at the sentence level
        if buffered and line.startswith('('):
            sentences.append('\n'.join(buffered))
            buffered = [line]
        else:
            buffered.append(line)
    if buffered:
        sentences.append('\n'.join(buffered))
    return sentences