minor bug fix

This commit is contained in:
Miroslav Stampar 2011-12-20 10:34:28 +00:00
parent 1b16b5e0f1
commit bdc724cb46
2 changed files with 28 additions and 22 deletions

View File

@@ -106,6 +106,9 @@ Alessandro Curio <alessandro.curio@gmail.com>
Alessio Dalla Piazza <alessio.dallapiazza@gmail.com> Alessio Dalla Piazza <alessio.dallapiazza@gmail.com>
for reporting a couple of bugs for reporting a couple of bugs
Sherif El-Deeb <archeldeeb@gmail.com>
for reporting a minor bug
Stefano Di Paola <stefano.dipaola@wisec.it> Stefano Di Paola <stefano.dipaola@wisec.it>
for suggesting good features for suggesting good features

View File

@@ -72,29 +72,32 @@ class Crawler:
break break
if isinstance(content, unicode): if isinstance(content, unicode):
soup = BeautifulSoup(content) try:
for tag in soup('a'): soup = BeautifulSoup(content)
if tag.get("href"): for tag in soup('a'):
url = urlparse.urljoin(conf.url, tag.get("href")) if tag.get("href"):
url = urlparse.urljoin(conf.url, tag.get("href"))
# flag to know if we are dealing with the same target host
target = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], [url, conf.url])) # flag to know if we are dealing with the same target host
target = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], [url, conf.url]))
if conf.scope:
if not re.search(conf.scope, url, re.I): if conf.scope:
if not re.search(conf.scope, url, re.I):
continue
elif not target:
continue continue
elif not target:
continue if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
kb.locks.outputs.acquire()
if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS: threadData.shared.deeper.add(url)
kb.locks.outputs.acquire() if re.search(r"(.*?)\?(.+)", url):
threadData.shared.deeper.add(url) threadData.shared.outputs.add(url)
if re.search(r"(.*?)\?(.+)", url): kb.locks.outputs.release()
threadData.shared.outputs.add(url) except UnicodeEncodeError: # for non-HTML files
kb.locks.outputs.release() pass
finally:
if conf.forms: if conf.forms:
findPageForms(content, current, False, True) findPageForms(content, current, False, True)
if conf.verbose in (1, 2): if conf.verbose in (1, 2):
kb.locks.ioLock.acquire() kb.locks.ioLock.acquire()