some charset fix up

This commit is contained in:
Miroslav Stampar 2010-06-30 12:09:33 +00:00
parent 24428c1a1b
commit 0d08903bc3

View File

@ -22,6 +22,7 @@ with sqlmap; if not, write to the Free Software Foundation, Inc., 51
Franklin St, Fifth Floor, Boston, MA 02110-1301 USA Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
""" """
import codecs
import gzip import gzip
import os import os
import re import re
@ -34,6 +35,7 @@ from lib.core.common import posixToNtSlashes
from lib.core.common import urlEncodeCookieValues from lib.core.common import urlEncodeCookieValues
from lib.core.data import conf from lib.core.data import conf
from lib.core.data import kb from lib.core.data import kb
from lib.core.data import logger
from lib.parse.headers import headersParser from lib.parse.headers import headersParser
from lib.parse.html import htmlParser from lib.parse.html import htmlParser
@ -88,6 +90,20 @@ def parseResponse(page, headers):
if absFilePath not in kb.absFilePaths: if absFilePath not in kb.absFilePaths:
kb.absFilePaths.add(absFilePath) kb.absFilePaths.add(absFilePath)
def checkCharEncoding(encoding):
#http://philip.html5.org/data/charsets-2.html
if encoding and encoding.startswith('cp-'):
encoding = 'cp%s' % encoding[3:]
try:
codecs.lookup(encoding)
except LookupError:
warnMsg = "unknown charset '%s'. " % encoding
warnMsg += "please report by e-mail to sqlmap-users@lists.sourceforge.net."
logger.warn(warnMsg)
encoding = conf.dataEncoding
return encoding
def decodePage(page, contentEncoding, contentType): def decodePage(page, contentEncoding, contentType):
""" """
Decode compressed/charset HTTP response Decode compressed/charset HTTP response
@ -104,6 +120,8 @@ def decodePage(page, contentEncoding, contentType):
#http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode #http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode
if contentType and (contentType.find('charset=') != -1): if contentType and (contentType.find('charset=') != -1):
page = unicode(page, contentType.split('charset=')[-1]) #don't use getUnicode here. it needs to stay as is. charset = checkCharEncoding(contentType.split('charset=')[-1])
if charset:
page = unicode(page, charset) #don't use getUnicode here. it needs to stay as is.
return page return page