Implementation of a feature suggested by pan@knownsec.com (use of the charset from the meta http-equiv attribute when the charset is not defined in the headers)

This commit is contained in:
Miroslav Stampar 2011-01-04 15:49:20 +00:00
parent 8a48baf789
commit aa81ed4033
3 changed files with 14 additions and 5 deletions

View File

@ -267,8 +267,8 @@ Simone Onofri <simone.onofri@gmail.com>
Windows Windows
Shaohua Pan <pan@knownsec.com> Shaohua Pan <pan@knownsec.com>
for reporting few bugs for reporting several bugs
for suggesting a feature for suggesting a few features
Antonio Parata <s4tan@ictsc.it> Antonio Parata <s4tan@ictsc.it>
for providing me with some ideas for the PHP backdoor for providing me with some ideas for the PHP backdoor

View File

@ -169,3 +169,5 @@ ERROR_PARSING_REGEXES = (
r"<li>Error Type:<br>(?P<result>.+?)</li>", r"<li>Error Type:<br>(?P<result>.+?)</li>",
r"error '[0-9a-f]{8}'((<[^>]+>)|\s)+(?P<result>[^<>]+)" r"error '[0-9a-f]{8}'((<[^>]+>)|\s)+(?P<result>[^<>]+)"
) )
META_CHARSET_REGEX = r'<meta http-equiv="Content-Type" content="[^"]*?charset=(?P<result>[^"]+)" />'

View File

@ -15,6 +15,7 @@ import StringIO
import zlib import zlib
from lib.core.common import extractErrorMessage from lib.core.common import extractErrorMessage
from lib.core.common import extractRegexResult
from lib.core.common import getCompiledRegex from lib.core.common import getCompiledRegex
from lib.core.common import getUnicode from lib.core.common import getUnicode
from lib.core.common import isWindowsDriveLetterPath from lib.core.common import isWindowsDriveLetterPath
@ -23,6 +24,7 @@ from lib.core.common import sanitizeAsciiString
from lib.core.data import conf from lib.core.data import conf
from lib.core.data import kb from lib.core.data import kb
from lib.core.data import logger from lib.core.data import logger
from lib.core.settings import META_CHARSET_REGEX
from lib.parse.headers import headersParser from lib.parse.headers import headersParser
from lib.parse.html import htmlParser from lib.parse.html import htmlParser
@ -127,12 +129,17 @@ def decodePage(page, contentEncoding, contentType):
page = data.read() page = data.read()
charset = None
# http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode # http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode
if contentType and (contentType.find('charset=') != -1): if contentType and (contentType.find('charset=') != -1):
charset = checkCharEncoding(contentType.split('charset=')[-1]) charset = contentType.split('charset=')[-1]
elif extractRegexResult(META_CHARSET_REGEX, page, re.DOTALL | re.IGNORECASE):
charset = extractRegexResult(META_CHARSET_REGEX, page, re.DOTALL | re.IGNORECASE)
if charset: charset = checkCharEncoding(charset)
kb.pageEncoding = charset if charset:
kb.pageEncoding = charset
return getUnicode(page) return getUnicode(page)