mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			48 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			48 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import re
 | 
						|
 | 
						|
 | 
						|
# Patterns for the SGML markup of OntoNotes web documents. Each field pattern
# has exactly one capture group holding the element's contents.

# Document-level header elements; their contents may not contain '>'.
docid_re = re.compile(r'<DOCID>([^>]+)</DOCID>')
doctype_re = re.compile(r'<DOCTYPE SOURCE="[^"]+">([^>]+)</DOCTYPE>')
datetime_re = re.compile(r'<DATETIME>([^>]+)</DATETIME>')

# Elements whose contents may span several lines, hence DOTALL.
headline_re = re.compile(r'<HEADLINE>(.+)</HEADLINE>', re.DOTALL)
post_re = re.compile(r'<POST>(.+)</POST>', re.DOTALL)

# Per-post metadata; without DOTALL these only match within a single line.
poster_re = re.compile(r'<POSTER>(.+)</POSTER>')
postdate_re = re.compile(r'<POSTDATE>(.+)</POSTDATE>')

# Any opening tag, its '>'-free contents, and a closing tag.
tag_re = re.compile(r'<[^>]+>[^>]+</[^>]+>')
 | 
						|
 | 
						|
 | 
						|
def sgml_extract(text_data):
    """Extract metadata and body text from one OntoNotes web document.

    Args:
        text_data: the SGML source of a single document, as a string.

    Returns:
        A single dict whose values are whitespace-stripped strings:

        {
            docid: string,
            doctype: string,
            datetime: string,
            headline: string,
            poster: string,
            postdate: string,
            text: string
        }

        docid, doctype, datetime and headline are looked up as required
        fields, so `_get_one` raises if one of those elements is absent.
        poster, postdate and text are taken from the <POST> element and are
        '' when it (or the corresponding inner element) is not found.
    """
    # Locate the <POST> element once; poster, postdate and text all read it.
    post = _get_one(post_re, text_data)
    return {
        'docid': _get_one(docid_re, text_data, required=True),
        'doctype': _get_one(doctype_re, text_data, required=True),
        'datetime': _get_one(datetime_re, text_data, required=True),
        'headline': _get_one(headline_re, text_data, required=True),
        'poster': _get_one(poster_re, post),
        'postdate': _get_one(postdate_re, post),
        # Drop the remaining markup from the post, then trim the result.
        'text': _get_text(post).strip()
    }
 | 
						|
 | 
						|
 | 
						|
def _get_one(regex, text, required=False):
    """Return the stripped contents of the single capture group of a match.

    Args:
        regex: compiled pattern, expected to define exactly one capture group.
        text: the string to search.
        required: when false (the default) a missing match yields ''.
            When true a missing match is an error.

    Returns:
        The text captured by the first match of `regex` in `text`, with
        leading and trailing whitespace removed, or '' if there is no match
        and `required` is false.

    Raises:
        ValueError: if `required` is true and `regex` does not match `text`.
        AssertionError: if the match does not have exactly one capture group.
    """
    match = regex.search(text)
    if match is None:
        if required:
            # Fail with a message naming the pattern, rather than an
            # AttributeError from calling .groups() on None.
            raise ValueError(
                'Required pattern {!r} not found in text'.format(regex.pattern))
        return ''
    # Internal invariant: callers pass patterns with a single capture group.
    assert len(match.groups()) == 1, match
    return match.group(1).strip()
 | 
						|
 | 
						|
 | 
						|
def _get_text(data):
    """Return `data` with its markup removed.

    Every span matched by `tag_re` is deleted first; any literal '<P>' and
    '</P>' markers left over are then removed, in that order.
    """
    cleaned = tag_re.sub('', data)
    for marker in ('<P>', '</P>'):
        cleaned = cleaned.replace(marker, '')
    return cleaned
 |