mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
* Add script to read OntoNotes source documents
This commit is contained in:
parent
13a8595a4b
commit
744f06abf5
47
spacy/munge/read_ontonotes.py
Normal file
47
spacy/munge/read_ontonotes.py
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
docid_re = re.compile(r'<DOCID>([^>]+)</DOCID>')
|
||||||
|
doctype_re = re.compile(r'<DOCTYPE SOURCE="[^"]+">([^>]+)</DOCTYPE>')
|
||||||
|
datetime_re = re.compile(r'<DATETIME>([^>]+)</DATETIME>')
|
||||||
|
headline_re = re.compile(r'<HEADLINE>(.+)</HEADLINE>', re.DOTALL)
|
||||||
|
post_re = re.compile(r'<POST>(.+)</POST>', re.DOTALL)
|
||||||
|
poster_re = re.compile(r'<POSTER>(.+)</POSTER>')
|
||||||
|
postdate_re = re.compile(r'<POSTDATE>(.+)</POSTDATE>')
|
||||||
|
tag_re = re.compile(r'<[^>]+>[^>]+</[^>]+>')
|
||||||
|
|
||||||
|
|
||||||
|
def sgml_extract(text_data):
|
||||||
|
"""Extract text from the OntoNotes web documents.
|
||||||
|
|
||||||
|
Format:
|
||||||
|
[{
|
||||||
|
docid: string,
|
||||||
|
doctype: string,
|
||||||
|
datetime: string,
|
||||||
|
poster: string,
|
||||||
|
postdate: string
|
||||||
|
text: [string]
|
||||||
|
}]
|
||||||
|
"""
|
||||||
|
return {
|
||||||
|
'docid': _get_one(docid_re, text_data, required=True),
|
||||||
|
'doctype': _get_one(doctype_re, text_data, required=True),
|
||||||
|
'datetime': _get_one(datetime_re, text_data, required=True),
|
||||||
|
'headline': _get_one(headline_re, text_data, required=True),
|
||||||
|
'poster': _get_one(poster_re, _get_one(post_re, text_data)),
|
||||||
|
'postdate': _get_one(postdate_re, _get_one(post_re, text_data)),
|
||||||
|
'text': _get_text(_get_one(post_re, text_data)).strip()
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _get_one(regex, text, required=False):
|
||||||
|
matches = regex.search(text)
|
||||||
|
if not matches and not required:
|
||||||
|
return ''
|
||||||
|
assert len(matches.groups()) == 1, matches
|
||||||
|
return matches.groups()[0].strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _get_text(data):
|
||||||
|
return tag_re.sub('', data).replace('<P>', '').replace('</P>', '')
|
Loading…
Reference in New Issue
Block a user