Fix for Issue #268

This commit is contained in:
Miroslav Stampar 2012-12-03 12:13:59 +01:00
parent 8410fc5a9d
commit 79fca8e9d5
3 changed files with 5 additions and 5 deletions

View File

@@ -222,10 +222,10 @@ ERROR_PARSING_REGEXES = (
) )
# Regular expression used for parsing charset info from meta html headers # Regular expression used for parsing charset info from meta html headers
META_CHARSET_REGEX = r'<meta http-equiv="?content-type"?[^>]+charset=(?P<result>[^">]+)' META_CHARSET_REGEX = r'(?si)<head>.*<meta http-equiv="?content-type"?[^>]+charset=(?P<result>[^">]+).*</head>'
# Regular expression used for parsing refresh info from meta html headers # Regular expression used for parsing refresh info from meta html headers
META_REFRESH_REGEX = r'<meta http-equiv="?refresh"?[^>]+content="?[^">]+url=(?P<result>[^">]+)' META_REFRESH_REGEX = r'(?si)<head>.*<meta http-equiv="?refresh"?[^>]+content="?[^">]+url=(?P<result>[^">]+).*</head>'
# Regular expression used for parsing empty fields in tested form data # Regular expression used for parsing empty fields in tested form data
EMPTY_FORM_FIELDS_REGEX = r'(&|\A)(?P<result>[^=]+=(&|\Z))' EMPTY_FORM_FIELDS_REGEX = r'(&|\A)(?P<result>[^=]+=(&|\Z))'

View File

@@ -220,7 +220,7 @@ def decodePage(page, contentEncoding, contentType):
if contentType and (contentType.find("charset=") != -1): if contentType and (contentType.find("charset=") != -1):
httpCharset = checkCharEncoding(contentType.split("charset=")[-1]) httpCharset = checkCharEncoding(contentType.split("charset=")[-1])
metaCharset = checkCharEncoding(extractRegexResult(META_CHARSET_REGEX, page, re.DOTALL | re.IGNORECASE)) metaCharset = checkCharEncoding(extractRegexResult(META_CHARSET_REGEX, page))
if ((httpCharset or metaCharset) and not all([httpCharset, metaCharset]))\ if ((httpCharset or metaCharset) and not all([httpCharset, metaCharset]))\
or (httpCharset == metaCharset and all([httpCharset, metaCharset])): or (httpCharset == metaCharset and all([httpCharset, metaCharset])):

View File

@@ -372,8 +372,8 @@ class Connect:
page = decodePage(page, responseHeaders.get(HTTPHEADER.CONTENT_ENCODING), responseHeaders.get(HTTPHEADER.CONTENT_TYPE)) page = decodePage(page, responseHeaders.get(HTTPHEADER.CONTENT_ENCODING), responseHeaders.get(HTTPHEADER.CONTENT_TYPE))
status = getUnicode(conn.msg) status = getUnicode(conn.msg)
if extractRegexResult(META_REFRESH_REGEX, page, re.DOTALL | re.IGNORECASE) and not refreshing: if extractRegexResult(META_REFRESH_REGEX, page) and not refreshing:
url = extractRegexResult(META_REFRESH_REGEX, page, re.DOTALL | re.IGNORECASE) url = extractRegexResult(META_REFRESH_REGEX, page)
debugMsg = "got HTML meta refresh header" debugMsg = "got HTML meta refresh header"
logger.debug(debugMsg) logger.debug(debugMsg)