adding Beautifulsoup (BSD) into extras; adding --crawl to options

Miroslav Stampar 2011-06-20 11:32:30 +00:00
parent 8c04aa871a
commit 07e2c72943
9 changed files with 2168 additions and 3 deletions


@@ -0,0 +1,37 @@
#!/usr/bin/env python
#
# Copyright (c) 2004-2010, Leonard Richardson
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
#
# * Neither the name of the the Beautiful Soup Consortium and All
# Night Kosher Bakery nor the names of its contributors may be
# used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
#
pass

File diff suppressed because it is too large.


@@ -589,7 +589,7 @@ def start():
     if kb.dataOutputFlag and not conf.multipleTargets:
         logger.info("Fetched data logged to text files under '%s'" % conf.outputPath)

-    if conf.multipleTargets:
+    if conf.multipleTargets and conf.resultsFilename:
         infoMsg = "you can find results of scanning in multiple targets "
         infoMsg += "mode inside the CSV file '%s'" % conf.resultsFilename
         logger.info(infoMsg)


@@ -114,6 +114,7 @@ from lib.request.certhandler import HTTPSCertAuthHandler
 from lib.request.rangehandler import HTTPRangeHandler
 from lib.request.redirecthandler import SmartRedirectHandler
 from lib.request.templates import getPageTemplate
+from lib.utils.crawler import Crawler
 from lib.utils.deps import checkDependencies
 from lib.utils.google import Google
@@ -388,6 +389,13 @@ def __setRequestFromFile():
     __feedTargetsDict(conf.requestFile, addedTargetUrls)

+def __setCrawler():
+    if not conf.crawl:
+        return
+
+    crawler = Crawler()
+    crawler.getTargetUrls()
+
 def __setGoogleDorking():
     """
     This function checks if the way to request testable hosts is through
@@ -1278,7 +1286,7 @@ def __cleanupOptions():
     if conf.tmpPath:
         conf.tmpPath = ntToPosixSlashes(normalizePath(conf.tmpPath))

-    if conf.googleDork or conf.logFile or conf.bulkFile or conf.forms:
+    if conf.googleDork or conf.logFile or conf.bulkFile or conf.forms or conf.crawl:
         conf.multipleTargets = True

     if conf.optimize:
@@ -1800,6 +1808,7 @@ def init(inputOptions=advancedDict(), overrideOptions=False):
     __setDNSCache()
     __setSafeUrl()
     __setGoogleDorking()
+    __setCrawler()
     __setBulkMultipleTargets()
     __urllib2Opener()
     __findPageForms()


@@ -167,6 +167,8 @@ optDict = {
         "beep": "boolean",
         "checkPayload": "boolean",
         "cleanup": "boolean",
+        "crawl": "boolean",
+        "forms": "boolean",
         "googlePage": "integer",
         "mobile": "boolean",
         "pageRank": "boolean",


@@ -511,6 +511,10 @@ def cmdLineParser():
                                   help="Clean up the DBMS by sqlmap specific "
                                        "UDF and tables")

+        miscellaneous.add_option("--crawl", dest="crawl",
+                                  action="store_true",
+                                  help="Crawl the website starting from the target url")
+
         miscellaneous.add_option("--forms", dest="forms",
                                   action="store_true",
                                   help="Parse and test forms on target url")

lib/utils/crawler.py (new file, 95 lines)

@@ -0,0 +1,95 @@
#!/usr/bin/env python

"""
$Id$

Copyright (c) 2006-2011 sqlmap developers (http://sqlmap.sourceforge.net/)
See the file 'doc/COPYING' for copying permission
"""

import re
import threading
import urlparse

from lib.core.common import dataToStdout
from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
from lib.core.exception import sqlmapConnectionException
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
from lib.request.connect import Connect as Request
from extra.beautifulsoup.beautifulsoup import BeautifulSoup
from extra.oset.pyoset import oset

class Crawler:
    """
    This class defines methods used to perform crawling (command
    line option '--crawl')
    """

    def getTargetUrls(self, depth=1):
        try:
            threadData = getCurrentThreadData()
            threadData.shared.outputs = oset()

            # locks shared by the crawler threads
            lockNames = ('limits', 'outputs')
            for lock in lockNames:
                kb.locks[lock] = threading.Lock()

            def crawlThread():
                threadData = getCurrentThreadData()

                while kb.threadContinue:
                    kb.locks.limits.acquire()
                    if threadData.shared.unprocessed:
                        current = threadData.shared.unprocessed.pop()
                        kb.locks.limits.release()
                    else:
                        kb.locks.limits.release()
                        break

                    content = Request.getPage(url=current)[0]

                    if not kb.threadContinue:
                        break

                    soup = BeautifulSoup(content)
                    for tag in soup('a'):
                        if tag.get("href"):
                            url = urlparse.urljoin(conf.url, tag.get("href"))

                            # flag to know if we are dealing with the same target host
                            target = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], [url, conf.url]))

                            if target:
                                kb.locks.outputs.acquire()
                                threadData.shared.deeper.add(url)
                                if re.search(r"(.*?)\?(.+)", url):
                                    threadData.shared.outputs.add(url)
                                kb.locks.outputs.release()

            threadData.shared.deeper = set()
            threadData.shared.unprocessed = set([conf.url])

            logger.info("starting crawling")

            # breadth-first: each pass processes the links collected at the previous depth
            for i in xrange(depth):
                numThreads = min(conf.threads, len(threadData.shared.unprocessed))
                logger.debug("processing depth: %d" % i)
                runThreads(numThreads, crawlThread)
                threadData.shared.unprocessed = threadData.shared.deeper

        except KeyboardInterrupt:
            warnMsg = "user aborted during crawling. sqlmap "
            warnMsg += "will use partial list"
            logger.warn(warnMsg)

        except sqlmapConnectionException, e:
            errMsg = "connection exception detected. sqlmap "
            errMsg += "will use partial list"
            errMsg += " '%s'" % e
            logger.critical(errMsg)

        finally:
            for url in threadData.shared.outputs:
                kb.targetUrls.add(( url, None, None, None ))

            kb.suppressResumeInfo = False
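
For reference, below is a standalone sketch (not sqlmap code) of the per-page logic getTargetUrls() applies: parse anchor tags, resolve each href against the base URL, keep only links on the same host for the next crawl depth, and queue links that carry a query string as candidate targets. It assumes Python 3 with the beautifulsoup4 package installed, whereas the diff above uses the bundled Python 2 BeautifulSoup, urlparse and reduce; the host comparison is written as a plain equality check, which is what the reduce() expression over the two netlocs amounts to.

import re
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

def extract_candidates(base_url, html):
    """Return (same_host_links, injectable_candidates) for a single fetched page."""
    base_host = urlparse(base_url).netloc.split(':')[0]
    same_host, candidates = set(), set()

    soup = BeautifulSoup(html, "html.parser")
    for tag in soup("a"):                       # soup("a") is shorthand for soup.find_all("a")
        href = tag.get("href")
        if not href:
            continue

        url = urljoin(base_url, href)           # resolve relative links against the page URL

        # same-target check: compare hostnames with any port stripped, as the diff does
        if urlparse(url).netloc.split(':')[0] != base_host:
            continue

        same_host.add(url)                      # crawled further at the next depth level
        if re.search(r"(.*?)\?(.+)", url):      # only URLs with a query string become test targets
            candidates.add(url)

    return same_host, candidates

if __name__ == "__main__":
    page = ('<a href="/item.php?id=1">a</a>'
            '<a href="/about.html">b</a>'
            '<a href="http://other.example/x.php?y=2">c</a>')
    deeper, targets = extract_candidates("http://www.target.com/index.php", page)
    print(sorted(deeper))   # both same-host links
    print(sorted(targets))  # only the item.php?id=1 link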


@@ -60,7 +60,7 @@ class Google:
         """

         for match in self.__matches:
-            if re.search(r"(.*?)\?(.+)", match, re.I):
+            if re.search(r"(.*?)\?(.+)", match):
                 kb.targetUrls.add(( htmlunescape(htmlunescape(match)), None, None, None ))
             elif re.search(URI_INJECTABLE_REGEX, match, re.I):
                 kb.targetUrls.add(( htmlunescape(htmlunescape("%s" % match)), None, None, None ))
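
The re.I flag is dropped here because the pattern "(.*?)\?(.+)" contains no alphabetic literals, so case-insensitive matching never changed the outcome; the crawler above now uses the identical pattern. A minimal illustration of what this filter keeps (plain Python, hypothetical URLs):

import re

pattern = r"(.*?)\?(.+)"

# URLs carrying a query string are kept as candidate GET injection targets
print(bool(re.search(pattern, "http://www.target.com/page.php?id=1")))  # True
# URLs without parameters are skipped
print(bool(re.search(pattern, "http://www.target.com/page.php")))       # False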


@@ -543,6 +543,10 @@ checkPayload = False
 # Valid: True or False
 cleanup = False

+# Crawl the website starting from the target url
+# Valid: True or False
+crawl = False
+
 # Parse and test forms on target url
 # Valid: True or False
 forms = False