"""Regex-based field extraction for OntoNotes web (SGML) documents."""

import re

# Each pattern captures the body of exactly one SGML element in group 1.
# NOTE(review): the element names below are inferred from the variable names
# (the patterns previously carried no delimiters, so every field matched the
# same text). Tag case is unknown from here, hence IGNORECASE; confirm both
# against the corpus files.
docid_re = re.compile(r'<DOCID>([^>]+)</DOCID>', re.IGNORECASE)
doctype_re = re.compile(r'<DOCTYPE[^>]*>([^>]+)</DOCTYPE>', re.IGNORECASE)
datetime_re = re.compile(r'<DATETIME>([^>]+)</DATETIME>', re.IGNORECASE)
headline_re = re.compile(
    r'<HEADLINE>(.+)</HEADLINE>', re.DOTALL | re.IGNORECASE)
post_re = re.compile(r'<POST>(.+)</POST>', re.DOTALL | re.IGNORECASE)
poster_re = re.compile(r'<POSTER>(.+)</POSTER>', re.IGNORECASE)
postdate_re = re.compile(r'<POSTDATE>(.+)</POSTDATE>', re.IGNORECASE)
# Matches a whole inline element: opening tag, its content, and closing tag.
tag_re = re.compile(r'<[^>]+>[^>]+</[^>]+>')


def sgml_extract(text_data):
    """Extract the metadata fields and post text from one document.

    Args:
        text_data: The raw SGML source of a document, as a string.

    Returns:
        A dict of stripped strings with the keys ``docid``, ``doctype``,
        ``datetime``, ``headline``, ``poster``, ``postdate`` and ``text``.
        ``poster``, ``postdate`` and ``text`` are empty strings when the
        document has no post element (or the post lacks that field).

    Raises:
        ValueError: If ``docid``, ``doctype``, ``datetime`` or ``headline``
            cannot be found in ``text_data``.
    """
    # Search for the post once; poster, postdate and text all derive from it.
    post = _get_one(post_re, text_data)
    return {
        'docid': _get_one(docid_re, text_data, required=True),
        'doctype': _get_one(doctype_re, text_data, required=True),
        'datetime': _get_one(datetime_re, text_data, required=True),
        'headline': _get_one(headline_re, text_data, required=True),
        'poster': _get_one(poster_re, post),
        'postdate': _get_one(postdate_re, post),
        'text': _get_text(post).strip(),
    }


def _get_one(regex, text, required=False):
    """Return the stripped single capture group of the first match.

    Args:
        regex: A compiled pattern that defines exactly one capture group.
        text: The string to search.
        required: When true, a missing match is an error instead of ``''``.

    Returns:
        The captured text with surrounding whitespace removed, or ``''`` when
        nothing matches and ``required`` is false.

    Raises:
        ValueError: If nothing matches and ``required`` is true.
    """
    match = regex.search(text)
    if match is None:
        if required:
            raise ValueError(
                'required pattern %r not found in document' % regex.pattern)
        return ''
    groups = match.groups()
    # Invariant of the module's patterns, not of the input document.
    assert len(groups) == 1, match
    return groups[0].strip()


def _get_text(data):
    """Return ``data`` with inline elements and blank-line breaks removed."""
    without_tags = tag_re.sub('', data)
    # NOTE(review): both replace() arguments are identical, so the second call
    # is a no-op. They may originally have been two distinct markup strings;
    # confirm before simplifying.
    return without_tags.replace('\n\n', '').replace('\n\n', '')