Implement a feature suggested by pan@knownsec.com: use the charset from the HTML meta http-equiv attribute when no charset is defined in the HTTP headers

2025-07-22 22:19:46 +03:00 · 2011-01-04 15:49:20 +00:00 · 2011-01-04 15:49:20 +00:00 · aa81ed4033
commit aa81ed4033
parent 8a48baf789
3 changed files with 14 additions and 5 deletions
--- a/doc/THANKS
+++ b/doc/THANKS
@@ -267,8 +267,8 @@ Simone Onofri <simone.onofri@gmail.com>
    Windows

 Shaohua Pan <pan@knownsec.com>
-    for reporting few bugs
-    for suggesting a feature
+    for reporting several bugs
+    for suggesting a few features

 Antonio Parata <s4tan@ictsc.it>
    for providing me with some ideas for the PHP backdoor
--- a/lib/core/settings.py
+++ b/lib/core/settings.py
@@ -169,3 +169,5 @@ ERROR_PARSING_REGEXES = (
                            r"<li>Error Type:<br>(?P<result>.+?)</li>", 
                            r"error '[0-9a-f]{8}'((<[^>]+>)|\s)+(?P<result>[^<>]+)"
                        )
+
+META_CHARSET_REGEX  = r'<meta http-equiv="Content-Type" content="[^"]*?charset=(?P<result>[^"]+)" />'
--- a/lib/request/basic.py
+++ b/lib/request/basic.py
@@ -15,6 +15,7 @@ import StringIO
 import zlib

 from lib.core.common import extractErrorMessage
+from lib.core.common import extractRegexResult
 from lib.core.common import getCompiledRegex
 from lib.core.common import getUnicode
 from lib.core.common import isWindowsDriveLetterPath
@@ -23,6 +24,7 @@ from lib.core.common import sanitizeAsciiString
 from lib.core.data import conf
 from lib.core.data import kb
 from lib.core.data import logger
+from lib.core.settings import META_CHARSET_REGEX
 from lib.parse.headers import headersParser
 from lib.parse.html import htmlParser

@@ -127,12 +129,17 @@ def decodePage(page, contentEncoding, contentType):

        page = data.read()

+    charset = None
+
    # http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode
    if contentType and (contentType.find('charset=') != -1):
-        charset = checkCharEncoding(contentType.split('charset=')[-1])
+        charset = contentType.split('charset=')[-1]
+    elif extractRegexResult(META_CHARSET_REGEX, page, re.DOTALL | re.IGNORECASE):
+        charset = extractRegexResult(META_CHARSET_REGEX, page, re.DOTALL | re.IGNORECASE)

-        if charset:
-            kb.pageEncoding = charset
+    charset = checkCharEncoding(charset)
+    if charset:
+        kb.pageEncoding = charset

    return getUnicode(page)