implemented "break a tie" request by Andres Riancho

This commit is contained in:
Miroslav Stampar 2011-04-20 08:35:47 +00:00
parent df0331fe9b
commit 24435a2c20

View File

@@ -165,18 +165,23 @@ def decodePage(page, contentEncoding, contentType):
page = data.read() page = data.read()
charset = None httpCharset, metaCharset = None, None
# http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode # http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode
if contentType and (contentType.find('charset=') != -1): if contentType and (contentType.find('charset=') != -1):
charset = contentType.split('charset=')[-1] httpCharset = checkCharEncoding(contentType.split('charset=')[-1])
elif extractRegexResult(META_CHARSET_REGEX, page, re.DOTALL | re.IGNORECASE):
charset = extractRegexResult(META_CHARSET_REGEX, page, re.DOTALL | re.IGNORECASE)
kb.pageEncoding = checkCharEncoding(charset) or getHeuristicCharEncoding(page) metaCharset = checkCharEncoding(extractRegexResult(META_CHARSET_REGEX, page, re.DOTALL | re.IGNORECASE))
if (httpCharset or metaCharset and not all([httpCharset, metaCharset]))\
or (httpCharset == metaCharset and all([httpCharset, metaCharset])):
kb.pageEncoding = httpCharset or metaCharset
else:
kb.pageEncoding = None
if contentType and any(map(lambda x: x in contentType.lower(), ('text/txt', 'text/raw', 'text/html', 'text/xml'))): if contentType and any(map(lambda x: x in contentType.lower(), ('text/txt', 'text/raw', 'text/html', 'text/xml'))):
# can't do for all responses because we need to support binary files too # can't do for all responses because we need to support binary files too
kb.pageEncoding = kb.pageEncoding or getHeuristicCharEncoding(page)
page = getUnicode(page, kb.pageEncoding) page = getUnicode(page, kb.pageEncoding)
return page return page