major bug fix (different HTTP content charsets are now properly handled)

2025-10-18 09:44:28 +03:00 · 2010-06-09 14:40:36 +00:00 · 2010-06-09 14:40:36 +00:00 · eaef068c90
commit eaef068c90
parent 654d707d5d
2 changed files with 18 additions and 16 deletions
--- a/lib/request/basic.py
+++ b/lib/request/basic.py
@@ -89,13 +89,13 @@ def parseResponse(page, headers):
                    kb.absFilePaths.add(absFilePath)
-def decodePage(page, encoding):
+def decodePage(page, contentEncoding, contentType):
    """
-    Decode gzip/deflate HTTP response
+    Decode compressed/charset HTTP response
    """
-    if isinstance(encoding, basestring) and encoding.lower() in ('gzip', 'x-gzip', 'deflate'):
+    if isinstance(contentEncoding, basestring) and contentEncoding.lower() in ('gzip', 'x-gzip', 'deflate'):
-        if encoding == 'deflate':
+        if contentEncoding == 'deflate':
            # http://stackoverflow.com/questions/1089662/python-inflate-and-deflate-implementations
            data = StringIO.StringIO(zlib.decompress(page, -15))
        else:
@@ -103,4 +103,8 @@ def decodePage(page, encoding):
        page = data.read()
+    #http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode
+    if contentType and (contentType.find('charset=') != -1):
+        page = unicode(page, contentType.split('charset=')[-1])
    return page
--- a/lib/request/connect.py
+++ b/lib/request/connect.py
@@ -88,33 +88,32 @@ class Connect:
        try:
            if silent:
                socket.setdefaulttimeout(3)
-    
+
            if direct:
                if "?" in url:
                    url, params = url.split("?")
                    params = urlencode(params)
                    url = "%s?%s" % (url, params)
                    requestMsg += "?%s" % params
-    
+
            elif multipart:
                # Needed in this form because of potential circle dependency 
                # problem (option -> update -> connect -> option)
                from lib.core.option import proxyHandler
-                
+
                multipartOpener = urllib2.build_opener(proxyHandler, multipartpost.MultipartPostHandler)
                conn = multipartOpener.open(url, multipart)
-                page = conn.read()            
+                page = conn.read()
                responseHeaders = conn.info()
-    
+
-                encoding = responseHeaders.get("Content-Encoding")
+                page = decodePage(page, responseHeaders.get("Content-Encoding"), responseHeaders.get("Content-Type"))
-                page = decodePage(page, encoding)
+
                return page
-    
+
            else:
                if conf.parameters.has_key("GET") and not get:
                    get = conf.parameters["GET"]
-    
+
                if get:
                    get = urlencode(get)
                    url = "%s?%s" % (url, get)
@@ -190,8 +189,7 @@ class Connect:
            status          = conn.msg
            responseHeaders = conn.info()
-            encoding = responseHeaders.get("Content-Encoding")
+            page = decodePage(page, responseHeaders.get("Content-Encoding"), responseHeaders.get("Content-Type"))
-            page = decodePage(page, encoding)
        except urllib2.HTTPError, e:
            if e.code == 401: