spaCy/spacy/munge/read_ontonotes.py

import re


docid_re = re.compile(r'<DOCID>([^>]+)</DOCID>')
doctype_re = re.compile(r'<DOCTYPE SOURCE="[^"]+">([^>]+)</DOCTYPE>')
datetime_re = re.compile(r'<DATETIME>([^>]+)</DATETIME>')
headline_re = re.compile(r'<HEADLINE>(.+)</HEADLINE>', re.DOTALL)
post_re = re.compile(r'<POST>(.+)</POST>', re.DOTALL)
poster_re = re.compile(r'<POSTER>(.+)</POSTER>')
postdate_re = re.compile(r'<POSTDATE>(.+)</POSTDATE>')
tag_re = re.compile(r'<[^>]+>[^>]+</[^>]+>')


def sgml_extract(text_data):
    """Extract text from the OntoNotes web documents.

    Format:
    [{
        docid: string,
        doctype: string,
        datetime: string,
        poster: string,
        postdate: string
        text: [string]
    }]
    """
    return {
        'docid': _get_one(docid_re, text_data, required=True),
        'doctype': _get_one(doctype_re, text_data, required=True),
        'datetime': _get_one(datetime_re, text_data, required=True),
        'headline': _get_one(headline_re, text_data, required=True),
        'poster': _get_one(poster_re, _get_one(post_re, text_data)),
        'postdate': _get_one(postdate_re, _get_one(post_re, text_data)),
        'text': _get_text(_get_one(post_re, text_data)).strip()
    }


def _get_one(regex, text, required=False):
    matches = regex.search(text)
    if not matches and not required:
        return ''
    assert len(matches.groups()) == 1, matches
    return matches.groups()[0].strip()


def _get_text(data):
    return tag_re.sub('', data).replace('<P>', '').replace('</P>', '')
* Add script to read OntoNotes source documents 2015-05-24 22:49:58 +03:00			`import re`


			`docid_re = re.compile(r'<DOCID>([^>]+)</DOCID>')`
			`doctype_re = re.compile(r'<DOCTYPE SOURCE="[^"]+">([^>]+)</DOCTYPE>')`
			`datetime_re = re.compile(r'<DATETIME>([^>]+)</DATETIME>')`
			`headline_re = re.compile(r'<HEADLINE>(.+)</HEADLINE>', re.DOTALL)`
			`post_re = re.compile(r'<POST>(.+)</POST>', re.DOTALL)`
			`poster_re = re.compile(r'<POSTER>(.+)</POSTER>')`
			`postdate_re = re.compile(r'<POSTDATE>(.+)</POSTDATE>')`
			`tag_re = re.compile(r'<[^>]+>[^>]+</[^>]+>')`


			`def sgml_extract(text_data):`
			`"""Extract text from the OntoNotes web documents.`

			`Format:`
			`[{`
			`docid: string,`
			`doctype: string,`
			`datetime: string,`
			`poster: string,`
			`postdate: string`
			`text: [string]`
			`}]`
			`"""`
			`return {`
			`'docid': _get_one(docid_re, text_data, required=True),`
			`'doctype': _get_one(doctype_re, text_data, required=True),`
			`'datetime': _get_one(datetime_re, text_data, required=True),`
			`'headline': _get_one(headline_re, text_data, required=True),`
			`'poster': _get_one(poster_re, _get_one(post_re, text_data)),`
			`'postdate': _get_one(postdate_re, _get_one(post_re, text_data)),`
			`'text': _get_text(_get_one(post_re, text_data)).strip()`
			`}`


			`def _get_one(regex, text, required=False):`
			`matches = regex.search(text)`
			`if not matches and not required:`
			`return ''`
			`assert len(matches.groups()) == 1, matches`
			`return matches.groups()[0].strip()`


			`def _get_text(data):`
			`return tag_re.sub('', data).replace('<P>', '').replace('</P>', '')`