From 3d4f381ab502712ddd67c1e7b399482b7c8bd7b0 Mon Sep 17 00:00:00 2001
From: Miroslav Stampar <miroslav.stampar@gmail.com>
Date: Wed, 9 Jan 2013 15:22:21 +0100
Subject: [PATCH] Patch for an Issue #169

---
 lib/core/option.py   |  25 +++---
 lib/utils/crawler.py | 198 +++++++++++++++++++++----------------------
 2 files changed, 110 insertions(+), 113 deletions(-)

diff --git a/lib/core/option.py b/lib/core/option.py
index 1221d525c..28c4603d3 100644
--- a/lib/core/option.py
+++ b/lib/core/option.py
@@ -134,7 +134,7 @@ from lib.request.httpshandler import HTTPSHandler
 from lib.request.rangehandler import HTTPRangeHandler
 from lib.request.redirecthandler import SmartRedirectHandler
 from lib.request.templates import getPageTemplate
-from lib.utils.crawler import Crawler
+from lib.utils.crawler import crawl
 from lib.utils.deps import checkDependencies
 from lib.utils.google import Google
 from thirdparty.colorama.initialise import init as coloramainit
@@ -461,8 +461,7 @@ def _setCrawler():
     if not conf.crawlDepth:
         return
 
-    crawler = Crawler()
-    crawler.getTargetUrls()
+    crawl(conf.url)
 
 def _setGoogleDorking():
     """
@@ -570,15 +569,19 @@ def _findPageForms():
     if not conf.forms or conf.crawlDepth:
         return
 
-    if not checkConnection():
+    if conf.url and not checkConnection():
         return
 
     infoMsg = "searching for forms"
     logger.info(infoMsg)
 
-    page, _ = Request.queryPage(content=True)
-
-    findPageForms(page, conf.url, True, True)
+    if not conf.bulkFile:
+        page, _ = Request.queryPage(content=True)
+        findPageForms(page, conf.url, True, True)
+    else:
+        for target, _, _, _ in kb.targets[:]:
+            page, _, _ = Request.getPage(url=target, crawling=True, raise404=False)
+            findPageForms(page, target, False, True)
 
 def _setDBMSAuthentication():
     """
@@ -1961,8 +1964,8 @@ def _basicOptionValidation():
         errMsg = "maximum number of used threads is %d avoiding possible connection issues" % MAX_NUMBER_OF_THREADS
         raise SqlmapSyntaxException(errMsg)
 
-    if conf.forms and not conf.url:
-        errMsg = "switch '--forms' requires usage of option '-u' (--url)"
+    if conf.forms and not any((conf.url, conf.bulkFile)):
+        errMsg = "switch '--forms' requires usage of option '-u' (--url) or '-m'"
         raise SqlmapSyntaxException(errMsg)
 
     if conf.requestFile and conf.url:
@@ -2005,8 +2008,8 @@ def _basicOptionValidation():
         errMsg = "option '--proxy' is incompatible with switch '--ignore-proxy'"
         raise SqlmapSyntaxException(errMsg)
 
-    if conf.forms and any([conf.logFile, conf.bulkFile, conf.direct, conf.requestFile, conf.googleDork]):
-        errMsg = "switch '--forms' is compatible only with option '-u' (--url)"
+    if conf.forms and any([conf.logFile, conf.direct, conf.requestFile, conf.googleDork]):
+        errMsg = "switch '--forms' is compatible only with options '-u' (--url) and '-m'"
         raise SqlmapSyntaxException(errMsg)
 
     if conf.timeSec < 1:
diff --git a/lib/utils/crawler.py b/lib/utils/crawler.py
index f19b9fc88..205faf451 100644
--- a/lib/utils/crawler.py
+++ b/lib/utils/crawler.py
@@ -25,116 +25,110 @@ from lib.request.connect import Connect as Request
 from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
 from thirdparty.oset.pyoset import oset
 
-class Crawler(object):
-    """
-    This class defines methods used to perform crawling (command
-    line option '--crawl'
-    """
+def crawl(target):
+    try:
+        threadData = getCurrentThreadData()
+        threadData.shared.value = oset()
 
-    def getTargetUrls(self):
-        try:
+        def crawlThread():
             threadData = getCurrentThreadData()
-            threadData.shared.value = oset()
-
-            def crawlThread():
-                threadData = getCurrentThreadData()
-
-                while kb.threadContinue:
-                    with kb.locks.limit:
-                        if threadData.shared.unprocessed:
-                            current = threadData.shared.unprocessed.pop()
-                        else:
-                            break
-
-                    content = None
-                    try:
-                        if current:
-                            content = Request.getPage(url=current, crawling=True, raise404=False)[0]
-                    except SqlmapConnectionException, e:
-                        errMsg = "connection exception detected (%s). skipping " % e
-                        errMsg += "url '%s'" % current
-                        logger.critical(errMsg)
-                    except httplib.InvalidURL, e:
-                        errMsg = "invalid url detected (%s). skipping " % e
-                        errMsg += "url '%s'" % current
-                        logger.critical(errMsg)
-
-                    if not kb.threadContinue:
-                        break
-
-                    if isinstance(content, unicode):
-                        try:
-                            match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
-                            if match:
-                                content = "<html>%s</html>" % match.group(1)
-
-                            soup = BeautifulSoup(content)
-                            tags = soup('a')
-
-                            if not tags:
-                                tags = re.finditer(r'(?si)<a[^>]+href="(?P<href>[^>"]+)"', content)
-
-                            for tag in tags:
-                                href = tag.get("href") if hasattr(tag, "get") else tag.group("href")
-
-                                if href:
-                                    url = urlparse.urljoin(conf.url, href)
-
-                                    # flag to know if we are dealing with the same target host
-                                    _ = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], (url, conf.url)))
-
-                                    if conf.scope:
-                                        if not re.search(conf.scope, url, re.I):
-                                            continue
-                                    elif not _:
-                                        continue
-
-                                    if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
-                                        with kb.locks.value:
-                                            threadData.shared.deeper.add(url)
-                                            if re.search(r"(.*?)\?(.+)", url):
-                                                threadData.shared.value.add(url)
-                        except UnicodeEncodeError: # for non-HTML files
-                            pass
-                        finally:
-                            if conf.forms:
-                                findPageForms(content, current, False, True)
-
-                    if conf.verbose in (1, 2):
-                        threadData.shared.count += 1
-                        status = '%d/%d links visited (%d%s)' % (threadData.shared.count, threadData.shared.length, round(100.0*threadData.shared.count/threadData.shared.length), '%')
-                        dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)
-
-            threadData.shared.deeper = set()
-            threadData.shared.unprocessed = set([conf.url])
-
-            logger.info("starting crawler")
-
-            for i in xrange(conf.crawlDepth):
-                if i > 0 and conf.threads == 1:
-                    singleTimeWarnMessage("running in a single-thread mode. This could take a while.")
-                threadData.shared.count = 0
-                threadData.shared.length = len(threadData.shared.unprocessed)
-                numThreads = min(conf.threads, len(threadData.shared.unprocessed))
-                logger.info("searching for links with depth %d" % (i + 1))
-                runThreads(numThreads, crawlThread)
-                clearConsoleLine(True)
-                if threadData.shared.deeper:
-                    threadData.shared.unprocessed = set(threadData.shared.deeper)
-                else:
-                    break
-
-        except KeyboardInterrupt:
-            warnMsg = "user aborted during crawling. sqlmap "
-            warnMsg += "will use partial list"
-            logger.warn(warnMsg)
-
-        finally:
-            clearConsoleLine(True)
-
-            if not threadData.shared.value:
-                warnMsg = "no usable links found (with GET parameters)"
-                logger.warn(warnMsg)
-            else:
-                for url in threadData.shared.value:
-                    kb.targets.add(( url, None, None, None ))
+
+            while kb.threadContinue:
+                with kb.locks.limit:
+                    if threadData.shared.unprocessed:
+                        current = threadData.shared.unprocessed.pop()
+                    else:
+                        break
+
+                content = None
+                try:
+                    if current:
+                        content = Request.getPage(url=current, crawling=True, raise404=False)[0]
+                except SqlmapConnectionException, e:
+                    errMsg = "connection exception detected (%s). skipping " % e
+                    errMsg += "url '%s'" % current
+                    logger.critical(errMsg)
+                except httplib.InvalidURL, e:
+                    errMsg = "invalid url detected (%s). skipping " % e
+                    errMsg += "url '%s'" % current
+                    logger.critical(errMsg)
+
+                if not kb.threadContinue:
+                    break
+
+                if isinstance(content, unicode):
+                    try:
+                        match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
+                        if match:
+                            content = "<html>%s</html>" % match.group(1)
+
+                        soup = BeautifulSoup(content)
+                        tags = soup('a')
+
+                        if not tags:
+                            tags = re.finditer(r'(?si)<a[^>]+href="(?P<href>[^>"]+)"', content)
+
+                        for tag in tags:
+                            href = tag.get("href") if hasattr(tag, "get") else tag.group("href")
+
+                            if href:
+                                url = urlparse.urljoin(target, href)
+
+                                # flag to know if we are dealing with the same target host
+                                _ = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], (url, target)))
+
+                                if conf.scope:
+                                    if not re.search(conf.scope, url, re.I):
+                                        continue
+                                elif not _:
+                                    continue
+
+                                if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
+                                    with kb.locks.value:
+                                        threadData.shared.deeper.add(url)
+                                        if re.search(r"(.*?)\?(.+)", url):
+                                            threadData.shared.value.add(url)
+                    except UnicodeEncodeError: # for non-HTML files
+                        pass
+                    finally:
+                        if conf.forms:
+                            findPageForms(content, current, False, True)
+
+                if conf.verbose in (1, 2):
+                    threadData.shared.count += 1
+                    status = '%d/%d links visited (%d%s)' % (threadData.shared.count, threadData.shared.length, round(100.0*threadData.shared.count/threadData.shared.length), '%')
+                    dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)
+
+        threadData.shared.deeper = set()
+        threadData.shared.unprocessed = set([target])
+
+        logger.info("starting crawler")
+
+        for i in xrange(conf.crawlDepth):
+            if i > 0 and conf.threads == 1:
+                singleTimeWarnMessage("running in a single-thread mode. This could take a while.")
+            threadData.shared.count = 0
+            threadData.shared.length = len(threadData.shared.unprocessed)
+            numThreads = min(conf.threads, len(threadData.shared.unprocessed))
+            logger.info("searching for links with depth %d" % (i + 1))
+            runThreads(numThreads, crawlThread)
+            clearConsoleLine(True)
+            if threadData.shared.deeper:
+                threadData.shared.unprocessed = set(threadData.shared.deeper)
+            else:
+                break
+
+    except KeyboardInterrupt:
+        warnMsg = "user aborted during crawling. sqlmap "
+        warnMsg += "will use partial list"
+        logger.warn(warnMsg)
+
+    finally:
+        clearConsoleLine(True)
+
+        if not threadData.shared.value:
+            warnMsg = "no usable links found (with GET parameters)"
+            logger.warn(warnMsg)
+        else:
+            for url in threadData.shared.value:
+                kb.targets.add((url, None, None, None))
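
Editorial note (not part of the patch): the sketch below is a minimal, single-threaded illustration of the crawling flow that the new crawl(target) function implements -- per depth level every URL in "unprocessed" is fetched, same-host links are queued for the next level, and only links carrying GET parameters are collected as usable targets. It assumes Python 2 (matching the module above); fetch() and crawl_depth are hypothetical stand-ins for sqlmap's Request.getPage() and conf.crawlDepth, and crawl_sketch itself is not a sqlmap API.

    # Illustrative sketch only, assuming Python 2; fetch(url) is a hypothetical
    # callable returning a page body (or None) for the given URL.
    import re
    import urlparse

    def crawl_sketch(target, fetch, crawl_depth=2):
        collected = set()              # links with GET parameters (usable targets)
        unprocessed = set([target])    # URLs to visit at the current depth

        for _ in xrange(crawl_depth):
            deeper = set()             # URLs discovered for the next depth level

            for current in unprocessed:
                content = fetch(current)
                if not content:
                    continue

                # same fallback link-extraction regex used in the patch
                for match in re.finditer(r'(?si)<a[^>]+href="(?P<href>[^>"]+)"', content):
                    url = urlparse.urljoin(target, match.group("href"))

                    # keep only links pointing at the same host as the start URL
                    if urlparse.urlparse(url).netloc.split(':')[0] != urlparse.urlparse(target).netloc.split(':')[0]:
                        continue

                    deeper.add(url)

                    if re.search(r"(.*?)\?(.+)", url):  # link carries GET parameters
                        collected.add(url)

            if not deeper:
                break

            unprocessed = deeper

        return collected

Under these assumptions, calling crawl_sketch("http://www.example.com/", fetch) mirrors how crawl(conf.url) feeds kb.targets before the '--forms' processing shown in option.py above.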