Fix for crawler and redirection case

This commit is contained in:
stamparm 2013-04-30 18:08:26 +02:00
parent 09e7f4f697
commit ebe8ee3500
3 changed files with 5 additions and 1 deletion

View File

@@ -49,6 +49,7 @@ class _ThreadData(threading.local):
self.lastQueryDuration = 0
self.lastRequestMsg = None
self.lastRequestUID = 0
self.lastRedirectURL = None
self.resumed = False
self.retriesCount = 0
self.seqMatcher = difflib.SequenceMatcher(None)

View File

@@ -117,9 +117,10 @@ class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
else:
result = fp
threadData.lastRedirectURL = (threadData.lastRequestUID, redurl)
result.redcode = code
result.redurl = redurl
return result
http_error_301 = http_error_303 = http_error_307 = http_error_302

View File

@@ -72,6 +72,8 @@ def crawl(target):
href = tag.get("href") if hasattr(tag, "get") else tag.group("href")
if href:
if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:
current = threadData.lastRedirectURL[1]
url = urlparse.urljoin(current, href)
# flag to know if we are dealing with the same target host