Implementation of a feature suggested by pan@knownsec.com (use the charset from the meta http-equiv attribute when no charset is defined in the HTTP headers)

This commit is contained in:
Miroslav Stampar 2011-01-04 15:49:20 +00:00
parent 8a48baf789
commit aa81ed4033
3 changed files with 14 additions and 5 deletions

View File

@ -267,8 +267,8 @@ Simone Onofri <simone.onofri@gmail.com>
Windows
Shaohua Pan <pan@knownsec.com>
for reporting few bugs
for suggesting a feature
for reporting several bugs
for suggesting a few features
Antonio Parata <s4tan@ictsc.it>
for providing me with some ideas for the PHP backdoor

View File

@ -169,3 +169,5 @@ ERROR_PARSING_REGEXES = (
r"<li>Error Type:<br>(?P<result>.+?)</li>",
r"error '[0-9a-f]{8}'((<[^>]+>)|\s)+(?P<result>[^<>]+)"
)
# Extracts the charset declared by an HTML <meta http-equiv="Content-Type" ...>
# tag into the named group "result". Tolerates single-quoted or unquoted
# attribute values, arbitrary whitespace between attributes, and tags that are
# not self-closed with " />". The pattern itself carries no flags, so pass
# re.IGNORECASE when matching to accept any letter case of the tag and
# attribute names. Only the http-equiv-before-content attribute order is
# recognised.
META_CHARSET_REGEX = r'<meta\s+http-equiv=["\']?Content-Type["\']?\s+content=["\']?[^"\'>]*?charset=(?P<result>[^"\'\s;>]+)'

View File

@ -15,6 +15,7 @@ import StringIO
import zlib
from lib.core.common import extractErrorMessage
from lib.core.common import extractRegexResult
from lib.core.common import getCompiledRegex
from lib.core.common import getUnicode
from lib.core.common import isWindowsDriveLetterPath
@ -23,6 +24,7 @@ from lib.core.common import sanitizeAsciiString
from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
from lib.core.settings import META_CHARSET_REGEX
from lib.parse.headers import headersParser
from lib.parse.html import htmlParser
@ -127,10 +129,15 @@ def decodePage(page, contentEncoding, contentType):
page = data.read()
charset = None
# http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode
if contentType and (contentType.find('charset=') != -1):
charset = checkCharEncoding(contentType.split('charset=')[-1])
charset = contentType.split('charset=')[-1]
elif extractRegexResult(META_CHARSET_REGEX, page, re.DOTALL | re.IGNORECASE):
charset = extractRegexResult(META_CHARSET_REGEX, page, re.DOTALL | re.IGNORECASE)
charset = checkCharEncoding(charset)
if charset:
kb.pageEncoding = charset