some charset fix up

2025-07-18 20:22:24 +03:00 · 2010-06-30 12:09:33 +00:00 · 2010-06-30 12:09:33 +00:00 · 0d08903bc3
commit 0d08903bc3
parent 24428c1a1b
1 changed files with 20 additions and 2 deletions
--- a/lib/request/basic.py
+++ b/lib/request/basic.py
@@ -22,6 +22,7 @@ with sqlmap; if not, write to the Free Software Foundation, Inc., 51
 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 """

+import codecs
 import gzip
 import os
 import re
@@ -34,6 +35,7 @@ from lib.core.common import posixToNtSlashes
 from lib.core.common import urlEncodeCookieValues
 from lib.core.data import conf
 from lib.core.data import kb
+from lib.core.data import logger
 from lib.parse.headers import headersParser
 from lib.parse.html import htmlParser

@@ -88,6 +90,20 @@ def parseResponse(page, headers):
                if absFilePath not in kb.absFilePaths:
                    kb.absFilePaths.add(absFilePath)

+def checkCharEncoding(encoding):
+    #http://philip.html5.org/data/charsets-2.html
+    if encoding and encoding.startswith('cp-'):
+        encoding = 'cp%s' % encoding[3:]
+    try:
+        codecs.lookup(encoding)
+    except LookupError:
+        warnMsg  = "unknown charset '%s'. " % encoding
+        warnMsg += "please report by e-mail to sqlmap-users@lists.sourceforge.net."
+
+        logger.warn(warnMsg)
+        encoding = conf.dataEncoding
+    return encoding
+
 def decodePage(page, contentEncoding, contentType):
    """
    Decode compressed/charset HTTP response
@@ -104,6 +120,8 @@ def decodePage(page, contentEncoding, contentType):
    
    #http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode
    if contentType and (contentType.find('charset=') != -1):
-        page = unicode(page, contentType.split('charset=')[-1])     #don't use getUnicode here. it needs to stay as is.
+        charset = checkCharEncoding(contentType.split('charset=')[-1])
+        if charset:
+            page = unicode(page, charset)     #don't use getUnicode here. it needs to stay as is.

    return page