Major bug fix: HTTP response pages are now decoded to unicode using the charset declared in the Content-Type header, so different content charsets are handled properly

This commit is contained in:
Miroslav Stampar 2010-06-09 14:40:36 +00:00
parent 654d707d5d
commit eaef068c90
2 changed files with 18 additions and 16 deletions

View File

@ -89,13 +89,13 @@ def parseResponse(page, headers):
kb.absFilePaths.add(absFilePath) kb.absFilePaths.add(absFilePath)
def decodePage(page, encoding): def decodePage(page, contentEncoding, contentType):
""" """
Decode gzip/deflate HTTP response Decode compressed/charset HTTP response
""" """
if isinstance(encoding, basestring) and encoding.lower() in ('gzip', 'x-gzip', 'deflate'): if isinstance(contentEncoding, basestring) and contentEncoding.lower() in ('gzip', 'x-gzip', 'deflate'):
if encoding == 'deflate': if contentEncoding == 'deflate':
# http://stackoverflow.com/questions/1089662/python-inflate-and-deflate-implementations # http://stackoverflow.com/questions/1089662/python-inflate-and-deflate-implementations
data = StringIO.StringIO(zlib.decompress(page, -15)) data = StringIO.StringIO(zlib.decompress(page, -15))
else: else:
@ -103,4 +103,8 @@ def decodePage(page, encoding):
page = data.read() page = data.read()
#http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode
if contentType and (contentType.find('charset=') != -1):
page = unicode(page, contentType.split('charset=')[-1])
return page return page

View File

@ -88,33 +88,32 @@ class Connect:
try: try:
if silent: if silent:
socket.setdefaulttimeout(3) socket.setdefaulttimeout(3)
if direct: if direct:
if "?" in url: if "?" in url:
url, params = url.split("?") url, params = url.split("?")
params = urlencode(params) params = urlencode(params)
url = "%s?%s" % (url, params) url = "%s?%s" % (url, params)
requestMsg += "?%s" % params requestMsg += "?%s" % params
elif multipart: elif multipart:
# Needed in this form because of potential circle dependency # Needed in this form because of potential circle dependency
# problem (option -> update -> connect -> option) # problem (option -> update -> connect -> option)
from lib.core.option import proxyHandler from lib.core.option import proxyHandler
multipartOpener = urllib2.build_opener(proxyHandler, multipartpost.MultipartPostHandler) multipartOpener = urllib2.build_opener(proxyHandler, multipartpost.MultipartPostHandler)
conn = multipartOpener.open(url, multipart) conn = multipartOpener.open(url, multipart)
page = conn.read() page = conn.read()
responseHeaders = conn.info() responseHeaders = conn.info()
encoding = responseHeaders.get("Content-Encoding") page = decodePage(page, responseHeaders.get("Content-Encoding"), responseHeaders.get("Content-Type"))
page = decodePage(page, encoding)
return page return page
else: else:
if conf.parameters.has_key("GET") and not get: if conf.parameters.has_key("GET") and not get:
get = conf.parameters["GET"] get = conf.parameters["GET"]
if get: if get:
get = urlencode(get) get = urlencode(get)
url = "%s?%s" % (url, get) url = "%s?%s" % (url, get)
@ -190,8 +189,7 @@ class Connect:
status = conn.msg status = conn.msg
responseHeaders = conn.info() responseHeaders = conn.info()
encoding = responseHeaders.get("Content-Encoding") page = decodePage(page, responseHeaders.get("Content-Encoding"), responseHeaders.get("Content-Type"))
page = decodePage(page, encoding)
except urllib2.HTTPError, e: except urllib2.HTTPError, e:
if e.code == 401: if e.code == 401: