From 24435a2c20179bf822dd3369ff86b4666b8160d7 Mon Sep 17 00:00:00 2001
From: Miroslav Stampar
Date: Wed, 20 Apr 2011 08:35:47 +0000
Subject: [PATCH] implemented "break a tie" request by Andres Riancho

---
 lib/request/basic.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/lib/request/basic.py b/lib/request/basic.py
index 6984f8fbc..3133fa5bf 100644
--- a/lib/request/basic.py
+++ b/lib/request/basic.py
@@ -165,18 +165,23 @@ def decodePage(page, contentEncoding, contentType):
 
         page = data.read()
 
-    charset = None
+    httpCharset, metaCharset = None, None
 
     # http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode
     if contentType and (contentType.find('charset=') != -1):
-        charset = contentType.split('charset=')[-1]
-    elif extractRegexResult(META_CHARSET_REGEX, page, re.DOTALL | re.IGNORECASE):
-        charset = extractRegexResult(META_CHARSET_REGEX, page, re.DOTALL | re.IGNORECASE)
+        httpCharset = checkCharEncoding(contentType.split('charset=')[-1])
 
-    kb.pageEncoding = checkCharEncoding(charset) or getHeuristicCharEncoding(page)
+    metaCharset = checkCharEncoding(extractRegexResult(META_CHARSET_REGEX, page, re.DOTALL | re.IGNORECASE))
+
+    if (httpCharset or metaCharset and not all([httpCharset, metaCharset]))\
+        or (httpCharset == metaCharset and all([httpCharset, metaCharset])):
+        kb.pageEncoding = httpCharset or metaCharset
+    else:
+        kb.pageEncoding = None
 
     if contentType and any(map(lambda x: x in contentType.lower(), ('text/txt', 'text/raw', 'text/html', 'text/xml'))):
         # can't do for all responses because we need to support binary files too
+        kb.pageEncoding = kb.pageEncoding or getHeuristicCharEncoding(page)
         page = getUnicode(page, kb.pageEncoding)
 
     return page