Fix for an Issue #268

This commit is contained in:
Miroslav Stampar 2012-12-03 12:13:59 +01:00
parent 8410fc5a9d
commit 79fca8e9d5
3 changed files with 5 additions and 5 deletions

View File

@ -222,10 +222,10 @@ ERROR_PARSING_REGEXES = (
)
# Regular expression used for parsing charset info from meta html headers
META_CHARSET_REGEX = r'<meta http-equiv="?content-type"?[^>]+charset=(?P<result>[^">]+)'
META_CHARSET_REGEX = r'(?si)<head>.*<meta http-equiv="?content-type"?[^>]+charset=(?P<result>[^">]+).*</head>'
# Regular expression used for parsing refresh info from meta html headers
META_REFRESH_REGEX = r'<meta http-equiv="?refresh"?[^>]+content="?[^">]+url=(?P<result>[^">]+)'
META_REFRESH_REGEX = r'(?si)<head>.*<meta http-equiv="?refresh"?[^>]+content="?[^">]+url=(?P<result>[^">]+).*</head>'
# Regular expression used for parsing empty fields in tested form data
EMPTY_FORM_FIELDS_REGEX = r'(&|\A)(?P<result>[^=]+=(&|\Z))'

View File

@ -220,7 +220,7 @@ def decodePage(page, contentEncoding, contentType):
if contentType and (contentType.find("charset=") != -1):
httpCharset = checkCharEncoding(contentType.split("charset=")[-1])
metaCharset = checkCharEncoding(extractRegexResult(META_CHARSET_REGEX, page, re.DOTALL | re.IGNORECASE))
metaCharset = checkCharEncoding(extractRegexResult(META_CHARSET_REGEX, page))
if ((httpCharset or metaCharset) and not all([httpCharset, metaCharset]))\
or (httpCharset == metaCharset and all([httpCharset, metaCharset])):

View File

@ -372,8 +372,8 @@ class Connect:
page = decodePage(page, responseHeaders.get(HTTPHEADER.CONTENT_ENCODING), responseHeaders.get(HTTPHEADER.CONTENT_TYPE))
status = getUnicode(conn.msg)
if extractRegexResult(META_REFRESH_REGEX, page, re.DOTALL | re.IGNORECASE) and not refreshing:
url = extractRegexResult(META_REFRESH_REGEX, page, re.DOTALL | re.IGNORECASE)
if extractRegexResult(META_REFRESH_REGEX, page) and not refreshing:
url = extractRegexResult(META_REFRESH_REGEX, page)
debugMsg = "got HTML meta refresh header"
logger.debug(debugMsg)