import re
# Compiled patterns consumed by sgml_extract (field lookups) and _get_text
# (markup removal).  Each field pattern exposes exactly one capture group.
#
# NOTE(review): several of these patterns are character-for-character
# identical and none contains a literal tag name, which suggests the SGML
# tag delimiters were lost somewhere upstream -- confirm against the
# intended document format before relying on the extracted values.

# Single-line header fields: capture a run of characters up to the next '>'.
docid_re = re.compile(r"([^>]+)")
doctype_re = re.compile(r"([^>]+)")
datetime_re = re.compile(r"([^>]+)")

# Multi-line sections: DOTALL lets '.' run across line breaks.
headline_re = re.compile(r"(.+)", flags=re.DOTALL)
post_re = re.compile(r"(.+)", flags=re.DOTALL)

# Per-post fields: without DOTALL, '.' stops at the end of the line.
poster_re = re.compile(r"(.+)")
postdate_re = re.compile(r"(.+)")

# Markup span removed from the post body by _get_text.
tag_re = re.compile(r"<[^>]+>[^>]+[^>]+>")
def sgml_extract(text_data):
    """Extract metadata and body text from one OntoNotes web document.

    Args:
        text_data: Source text of a single document, as a string.

    Returns:
        A single dict (not a list) of strings with the keys::

            {
                'docid': string,
                'doctype': string,
                'datetime': string,
                'headline': string,
                'poster': string,
                'postdate': string,
                'text': string,
            }

        ``docid``, ``doctype``, ``datetime`` and ``headline`` are looked up
        as required fields.  ``poster``, ``postdate`` and ``text`` are read
        from the post section and come back as ``''`` when nothing matches.

    Raises:
        Propagates whatever ``_get_one`` raises when a required field
        cannot be found in ``text_data``.
    """
    # The post section feeds three fields; extract it once instead of
    # re-running the same search for each of them.
    post = _get_one(post_re, text_data)
    return {
        'docid': _get_one(docid_re, text_data, required=True),
        'doctype': _get_one(doctype_re, text_data, required=True),
        'datetime': _get_one(datetime_re, text_data, required=True),
        'headline': _get_one(headline_re, text_data, required=True),
        'poster': _get_one(poster_re, post),
        'postdate': _get_one(postdate_re, post),
        'text': _get_text(post).strip(),
    }
def _get_one(regex, text, required=False):
    """Return the stripped capture of the first match of *regex* in *text*.

    Args:
        regex: Compiled pattern; expected to define exactly one capture group.
        text: String to search.
        required: When true, a missing match is an error; when false, a
            missing match yields an empty string.

    Returns:
        The captured group of the first match with surrounding whitespace
        removed, or ``''`` if there is no match and ``required`` is false.

    Raises:
        ValueError: If ``required`` is true and *regex* does not match.
        AssertionError: If the match does not expose exactly one group
            (a mistake in the pattern definition, not in the input).
    """
    match = regex.search(text)
    if match is None:
        if required:
            # Fail with a message naming the pattern rather than an
            # AttributeError from dereferencing None.
            raise ValueError(
                'required pattern %r not found in text' % regex.pattern
            )
        return ''
    # Internal invariant on the pattern itself, hence an assert.
    assert len(match.groups()) == 1, match
    return match.group(1).strip()
def _get_text(data):
    """Return *data* with ``tag_re`` matches and newline characters removed.

    Args:
        data: String to clean (the post section of a document).

    Returns:
        The input with every ``tag_re`` match deleted and every newline
        character dropped.
    """
    # NOTE(review): the string literals here contained raw line breaks,
    # which is not valid inside a single-quoted literal.  They are read as
    # the newline character; confirm that no other token (e.g. a markup
    # string) was intended.
    return tag_re.sub('', data).replace('\n', '')