From 3d4f381ab502712ddd67c1e7b399482b7c8bd7b0 Mon Sep 17 00:00:00 2001
From: Miroslav Stampar <miroslav.stampar@gmail.com>
Date: Wed, 9 Jan 2013 15:22:21 +0100
Subject: [PATCH] Patch for an Issue #169

---
 lib/core/option.py   |  25 +++---
 lib/utils/crawler.py | 198 +++++++++++++++++++++----------------------
 2 files changed, 110 insertions(+), 113 deletions(-)

diff --git a/lib/core/option.py b/lib/core/option.py
index 1221d525c..28c4603d3 100644
--- a/lib/core/option.py
+++ b/lib/core/option.py
@@ -134,7 +134,7 @@ from lib.request.httpshandler import HTTPSHandler
 from lib.request.rangehandler import HTTPRangeHandler
 from lib.request.redirecthandler import SmartRedirectHandler
 from lib.request.templates import getPageTemplate
-from lib.utils.crawler import Crawler
+from lib.utils.crawler import crawl
 from lib.utils.deps import checkDependencies
 from lib.utils.google import Google
 from thirdparty.colorama.initialise import init as coloramainit
@@ -461,8 +461,7 @@ def _setCrawler():
     if not conf.crawlDepth:
         return
 
-    crawler = Crawler()
-    crawler.getTargetUrls()
+    crawl(conf.url)
 
 def _setGoogleDorking():
     """
@@ -570,15 +569,19 @@ def _findPageForms():
     if not conf.forms or conf.crawlDepth:
         return
 
-    if not checkConnection():
+    if conf.url and not checkConnection():
         return
 
     infoMsg = "searching for forms"
     logger.info(infoMsg)
 
-    page, _ = Request.queryPage(content=True)
-
-    findPageForms(page, conf.url, True, True)
+    if not conf.bulkFile:
+        page, _ = Request.queryPage(content=True)
+        findPageForms(page, conf.url, True, True)
+    else:
+        for target, _, _, _ in kb.targets[:]:
+            page, _, _ = Request.getPage(url=target, crawling=True, raise404=False)
+            findPageForms(page, target, False, True)
 
 def _setDBMSAuthentication():
     """
@@ -1961,8 +1964,8 @@ def _basicOptionValidation():
         errMsg = "maximum number of used threads is %d avoiding possible connection issues" % MAX_NUMBER_OF_THREADS
         raise SqlmapSyntaxException(errMsg)
 
-    if conf.forms and not conf.url:
-        errMsg = "switch '--forms' requires usage of option '-u' (--url)"
+    if conf.forms and not any((conf.url, conf.bulkFile)):
+        errMsg = "switch '--forms' requires usage of option '-u' (--url) or '-m'"
         raise SqlmapSyntaxException(errMsg)
 
     if conf.requestFile and conf.url:
@@ -2005,8 +2008,8 @@ def _basicOptionValidation():
         errMsg = "option '--proxy' is incompatible with switch '--ignore-proxy'"
         raise SqlmapSyntaxException(errMsg)
 
-    if conf.forms and any([conf.logFile, conf.bulkFile, conf.direct, conf.requestFile, conf.googleDork]):
-        errMsg = "switch '--forms' is compatible only with option '-u' (--url)"
+    if conf.forms and any([conf.logFile, conf.direct, conf.requestFile, conf.googleDork]):
+        errMsg = "switch '--forms' is compatible only with options '-u' (--url) and '-m'"
         raise SqlmapSyntaxException(errMsg)
 
     if conf.timeSec < 1:
diff --git a/lib/utils/crawler.py b/lib/utils/crawler.py
index f19b9fc88..205faf451 100644
--- a/lib/utils/crawler.py
+++ b/lib/utils/crawler.py
@@ -25,116 +25,110 @@ from lib.request.connect import Connect as Request
 from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
 from thirdparty.oset.pyoset import oset
 
-class Crawler(object):
-    """
-    This class defines methods used to perform crawling (command
-    line option '--crawl'
-    """
+def crawl(target):
+    try:
+        threadData = getCurrentThreadData()
+        threadData.shared.value = oset()
 
-    def getTargetUrls(self):
-        try:
+        def crawlThread():
             threadData = getCurrentThreadData()
-            threadData.shared.value = oset()
-
-            def crawlThread():
-                threadData = getCurrentThreadData()
-
-                while kb.threadContinue:
-                    with kb.locks.limit:
-                        if threadData.shared.unprocessed:
-                            current = threadData.shared.unprocessed.pop()
-                        else:
-                            break
-
-                    content = None
-                    try:
-                        if current:
-                            content = Request.getPage(url=current, crawling=True, raise404=False)[0]
-                    except SqlmapConnectionException, e:
-                        errMsg = "connection exception detected (%s). skipping " % e
-                        errMsg += "url '%s'" % current
-                        logger.critical(errMsg)
-                    except httplib.InvalidURL, e:
-                        errMsg = "invalid url detected (%s). skipping " % e
-                        errMsg += "url '%s'" % current
-                        logger.critical(errMsg)
-
-                    if not kb.threadContinue:
-                        break
-
-                    if isinstance(content, unicode):
-                        try:
-                            match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
-                            if match:
-                                content = "<html>%s</html>" % match.group(1)
-
-                            soup = BeautifulSoup(content)
-                            tags = soup('a')
-
-                            if not tags:
-                                tags = re.finditer(r'(?si)<a[^>]+href="(?P<href>[^>"]+)"', content)
-
-                            for tag in tags:
-                                href = tag.get("href") if hasattr(tag, "get") else tag.group("href")
-
-                                if href:
-                                    url = urlparse.urljoin(conf.url, href)
-
-                                    # flag to know if we are dealing with the same target host
-                                    _ = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], (url, conf.url)))
-
-                                    if conf.scope:
-                                        if not re.search(conf.scope, url, re.I):
-                                            continue
-                                    elif not _:
-                                        continue
-
-                                    if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
-                                        with kb.locks.value:
-                                            threadData.shared.deeper.add(url)
-                                            if re.search(r"(.*?)\?(.+)", url):
-                                                threadData.shared.value.add(url)
-                        except UnicodeEncodeError: # for non-HTML files
-                            pass
-                        finally:
-                            if conf.forms:
-                                findPageForms(content, current, False, True)
-
-                    if conf.verbose in (1, 2):
-                        threadData.shared.count += 1
-                        status = '%d/%d links visited (%d%s)' % (threadData.shared.count, threadData.shared.length, round(100.0*threadData.shared.count/threadData.shared.length), '%')
-                        dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)
-
-            threadData.shared.deeper = set()
-            threadData.shared.unprocessed = set([conf.url])
-
-            logger.info("starting crawler")
-
-            for i in xrange(conf.crawlDepth):
-                if i > 0 and conf.threads == 1:
-                    singleTimeWarnMessage("running in a single-thread mode. This could take a while.")
-                threadData.shared.count = 0
-                threadData.shared.length = len(threadData.shared.unprocessed)
-                numThreads = min(conf.threads, len(threadData.shared.unprocessed))
-                logger.info("searching for links with depth %d" % (i + 1))
-                runThreads(numThreads, crawlThread)
-                clearConsoleLine(True)
-                if threadData.shared.deeper:
-                    threadData.shared.unprocessed = set(threadData.shared.deeper)
-                else:
-                    break
-
-        except KeyboardInterrupt:
-            warnMsg = "user aborted during crawling. sqlmap "
-            warnMsg += "will use partial list"
-            logger.warn(warnMsg)
-
-        finally:
-            clearConsoleLine(True)
-
-            if not threadData.shared.value:
-                warnMsg = "no usable links found (with GET parameters)"
-                logger.warn(warnMsg)
-            else:
-                for url in threadData.shared.value:
-                    kb.targets.add(( url, None, None, None ))
+
+            while kb.threadContinue:
+                with kb.locks.limit:
+                    if threadData.shared.unprocessed:
+                        current = threadData.shared.unprocessed.pop()
+                    else:
+                        break
+
+                content = None
+                try:
+                    if current:
+                        content = Request.getPage(url=current, crawling=True, raise404=False)[0]
+                except SqlmapConnectionException, e:
+                    errMsg = "connection exception detected (%s). skipping " % e
+                    errMsg += "url '%s'" % current
+                    logger.critical(errMsg)
+                except httplib.InvalidURL, e:
+                    errMsg = "invalid url detected (%s). skipping " % e
+                    errMsg += "url '%s'" % current
+                    logger.critical(errMsg)
+
+                if not kb.threadContinue:
+                    break
+
+                if isinstance(content, unicode):
+                    try:
+                        match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
+                        if match:
+                            content = "<html>%s</html>" % match.group(1)
+
+                        soup = BeautifulSoup(content)
+                        tags = soup('a')
+
+                        if not tags:
+                            tags = re.finditer(r'(?si)<a[^>]+href="(?P<href>[^>"]+)"', content)
+
+                        for tag in tags:
+                            href = tag.get("href") if hasattr(tag, "get") else tag.group("href")
+
+                            if href:
+                                url = urlparse.urljoin(target, href)
+
+                                # flag to know if we are dealing with the same target host
+                                _ = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], (url, target)))
+
+                                if conf.scope:
+                                    if not re.search(conf.scope, url, re.I):
+                                        continue
+                                elif not _:
+                                    continue
+
+                                if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
+                                    with kb.locks.value:
+                                        threadData.shared.deeper.add(url)
+                                        if re.search(r"(.*?)\?(.+)", url):
+                                            threadData.shared.value.add(url)
+                    except UnicodeEncodeError: # for non-HTML files
+                        pass
+                    finally:
+                        if conf.forms:
+                            findPageForms(content, current, False, True)
+
+                if conf.verbose in (1, 2):
+                    threadData.shared.count += 1
+                    status = '%d/%d links visited (%d%s)' % (threadData.shared.count, threadData.shared.length, round(100.0*threadData.shared.count/threadData.shared.length), '%')
+                    dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)
+
+        threadData.shared.deeper = set()
+        threadData.shared.unprocessed = set([target])
+
+        logger.info("starting crawler")
+
+        for i in xrange(conf.crawlDepth):
+            if i > 0 and conf.threads == 1:
+                singleTimeWarnMessage("running in a single-thread mode. This could take a while.")
+            threadData.shared.count = 0
+            threadData.shared.length = len(threadData.shared.unprocessed)
+            numThreads = min(conf.threads, len(threadData.shared.unprocessed))
+            logger.info("searching for links with depth %d" % (i + 1))
+            runThreads(numThreads, crawlThread)
+            clearConsoleLine(True)
+            if threadData.shared.deeper:
+                threadData.shared.unprocessed = set(threadData.shared.deeper)
+            else:
+                break
+
+    except KeyboardInterrupt:
+        warnMsg = "user aborted during crawling. sqlmap "
+        warnMsg += "will use partial list"
+        logger.warn(warnMsg)
+
+    finally:
+        clearConsoleLine(True)
+
+        if not threadData.shared.value:
+            warnMsg = "no usable links found (with GET parameters)"
+            logger.warn(warnMsg)
+        else:
+            for url in threadData.shared.value:
+                kb.targets.add((url, None, None, None))
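
Editorial note (not part of the patch): the sketch below is a minimal, single-threaded illustration of the crawling flow that the new crawl(target) function implements -- per depth level every URL in "unprocessed" is fetched, same-host links are queued for the next level, and only links carrying GET parameters are collected as usable targets. It assumes Python 2 (matching the module above); fetch() and crawl_depth are hypothetical stand-ins for sqlmap's Request.getPage() and conf.crawlDepth, and crawl_sketch itself is not a sqlmap API.

    # Illustrative sketch only, assuming Python 2; fetch(url) is a hypothetical
    # callable returning a page body (or None) for the given URL.
    import re
    import urlparse

    def crawl_sketch(target, fetch, crawl_depth=2):
        collected = set()              # links with GET parameters (usable targets)
        unprocessed = set([target])    # URLs to visit at the current depth

        for _ in xrange(crawl_depth):
            deeper = set()             # URLs discovered for the next depth level

            for current in unprocessed:
                content = fetch(current)
                if not content:
                    continue

                # same fallback link-extraction regex used in the patch
                for match in re.finditer(r'(?si)<a[^>]+href="(?P<href>[^>"]+)"', content):
                    url = urlparse.urljoin(target, match.group("href"))

                    # keep only links pointing at the same host as the start URL
                    if urlparse.urlparse(url).netloc.split(':')[0] != urlparse.urlparse(target).netloc.split(':')[0]:
                        continue

                    deeper.add(url)

                    if re.search(r"(.*?)\?(.+)", url):  # link carries GET parameters
                        collected.add(url)

            if not deeper:
                break

            unprocessed = deeper

        return collected

Under these assumptions, calling crawl_sketch("http://www.example.com/", fetch) mirrors how crawl(conf.url) feeds kb.targets before the '--forms' processing shown in option.py above.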