adding Beautifulsoup (BSD) into extras; adding --crawl to options
parent 8c04aa871a
commit 07e2c72943
extra/beautifulsoup/__init__.py (new file, 37 lines)
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2004-2010, Leonard Richardson
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+#   copyright notice, this list of conditions and the following
+#   disclaimer in the documentation and/or other materials provided
+#   with the distribution.
+#
+# * Neither the name of the the Beautiful Soup Consortium and All
+#   Night Kosher Bakery nor the names of its contributors may be
+#   used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
+#
+
+pass
extra/beautifulsoup/beautifulsoup.py (new file, 2014 lines)
File diff suppressed because it is too large.
@@ -589,7 +589,7 @@ def start():
     if kb.dataOutputFlag and not conf.multipleTargets:
         logger.info("Fetched data logged to text files under '%s'" % conf.outputPath)

-    if conf.multipleTargets:
+    if conf.multipleTargets and conf.resultsFilename:
         infoMsg = "you can find results of scanning in multiple targets "
         infoMsg += "mode inside the CSV file '%s'" % conf.resultsFilename
         logger.info(infoMsg)
@@ -114,6 +114,7 @@ from lib.request.certhandler import HTTPSCertAuthHandler
 from lib.request.rangehandler import HTTPRangeHandler
 from lib.request.redirecthandler import SmartRedirectHandler
 from lib.request.templates import getPageTemplate
+from lib.utils.crawler import Crawler
 from lib.utils.deps import checkDependencies
 from lib.utils.google import Google

@ -388,6 +389,13 @@ def __setRequestFromFile():
|
||||||
|
|
||||||
__feedTargetsDict(conf.requestFile, addedTargetUrls)
|
__feedTargetsDict(conf.requestFile, addedTargetUrls)
|
||||||
|
|
||||||
|
def __setCrawler():
|
||||||
|
if not conf.crawl:
|
||||||
|
return
|
||||||
|
|
||||||
|
crawler = Crawler()
|
||||||
|
crawler.getTargetUrls()
|
||||||
|
|
||||||
def __setGoogleDorking():
|
def __setGoogleDorking():
|
||||||
"""
|
"""
|
||||||
This function checks if the way to request testable hosts is through
|
This function checks if the way to request testable hosts is through
|
||||||
|
@ -1278,7 +1286,7 @@ def __cleanupOptions():
|
||||||
if conf.tmpPath:
|
if conf.tmpPath:
|
||||||
conf.tmpPath = ntToPosixSlashes(normalizePath(conf.tmpPath))
|
conf.tmpPath = ntToPosixSlashes(normalizePath(conf.tmpPath))
|
||||||
|
|
||||||
if conf.googleDork or conf.logFile or conf.bulkFile or conf.forms:
|
if conf.googleDork or conf.logFile or conf.bulkFile or conf.forms or conf.crawl:
|
||||||
conf.multipleTargets = True
|
conf.multipleTargets = True
|
||||||
|
|
||||||
if conf.optimize:
|
if conf.optimize:
|
||||||
@@ -1800,6 +1808,7 @@ def init(inputOptions=advancedDict(), overrideOptions=False):
     __setDNSCache()
     __setSafeUrl()
     __setGoogleDorking()
+    __setCrawler()
     __setBulkMultipleTargets()
     __urllib2Opener()
     __findPageForms()
@ -167,6 +167,8 @@ optDict = {
|
||||||
"beep": "boolean",
|
"beep": "boolean",
|
||||||
"checkPayload": "boolean",
|
"checkPayload": "boolean",
|
||||||
"cleanup": "boolean",
|
"cleanup": "boolean",
|
||||||
|
"crawl": "boolean",
|
||||||
|
"forms": "boolean",
|
||||||
"googlePage": "integer",
|
"googlePage": "integer",
|
||||||
"mobile": "boolean",
|
"mobile": "boolean",
|
||||||
"pageRank": "boolean",
|
"pageRank": "boolean",
|
||||||
|
|
|
@ -511,6 +511,10 @@ def cmdLineParser():
|
||||||
help="Clean up the DBMS by sqlmap specific "
|
help="Clean up the DBMS by sqlmap specific "
|
||||||
"UDF and tables")
|
"UDF and tables")
|
||||||
|
|
||||||
|
miscellaneous.add_option("--crawl", dest="crawl",
|
||||||
|
action="store_true",
|
||||||
|
help="Crawl the website starting from the target url")
|
||||||
|
|
||||||
miscellaneous.add_option("--forms", dest="forms",
|
miscellaneous.add_option("--forms", dest="forms",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Parse and test forms on target url")
|
help="Parse and test forms on target url")
|
||||||
|
|
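The new switch is registered the same way as the existing --forms option. As a standalone sketch (illustrative only, not part of the commit), this is how an optparse store_true switch such as --crawl surfaces as a boolean value on the parsed options, which is what the "crawl": "boolean" entry added to optDict above describes and why __setCrawler() can simply test conf.crawl:

# Minimal sketch, not sqlmap code: a store_true switch becomes a boolean-ish value.
from optparse import OptionParser

parser = OptionParser()
parser.add_option("--crawl", dest="crawl", action="store_true",
                  help="Crawl the website starting from the target url")

opts, _ = parser.parse_args(["--crawl"])
assert opts.crawl is True      # switch present -> True
opts, _ = parser.parse_args([])
assert opts.crawl is None      # switch absent -> None (falsy), hence "if not conf.crawl: return"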
lib/utils/crawler.py (new file, 95 lines)
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+"""
+$Id$
+
+Copyright (c) 2006-2011 sqlmap developers (http://sqlmap.sourceforge.net/)
+See the file 'doc/COPYING' for copying permission
+"""
+
+import re
+import threading
+import urlparse
+
+from lib.core.common import dataToStdout
+from lib.core.data import conf
+from lib.core.data import kb
+from lib.core.data import logger
+from lib.core.exception import sqlmapConnectionException
+from lib.core.threads import getCurrentThreadData
+from lib.core.threads import runThreads
+from lib.request.connect import Connect as Request
+from extra.beautifulsoup.beautifulsoup import BeautifulSoup
+from extra.oset.pyoset import oset
+
+class Crawler:
+    """
+    This class defines methods used to perform crawling (command
+    line option '--crawl'
+    """
+
+    def getTargetUrls(self, depth=1):
+        try:
+            threadData = getCurrentThreadData()
+            threadData.shared.outputs = oset()
+
+            lockNames = ('limits', 'outputs')
+            for lock in lockNames:
+                kb.locks[lock] = threading.Lock()
+
+            def crawlThread():
+                threadData = getCurrentThreadData()
+
+                while kb.threadContinue:
+                    kb.locks.limits.acquire()
+                    if threadData.shared.unprocessed:
+                        current = threadData.shared.unprocessed.pop()
+                        kb.locks.limits.release()
+                    else:
+                        kb.locks.limits.release()
+                        break
+
+                    content = Request.getPage(url=conf.url)[0]
+
+                    if not kb.threadContinue:
+                        break
+
+                    soup = BeautifulSoup(content)
+                    for tag in soup('a'):
+                        if tag.get("href"):
+                            url = urlparse.urljoin(conf.url, tag.get("href"))
+                            # flag to know if we are dealing with the same target host
+                            target = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], [url, conf.url]))
+                            if target:
+                                kb.locks.outputs.acquire()
+                                threadData.shared.deeper.add(url)
+                                if re.search(r"(.*?)\?(.+)", url):
+                                    threadData.shared.outputs.add(url)
+                                kb.locks.outputs.release()
+
+            threadData.shared.deeper = set()
+            threadData.shared.unprocessed = set([conf.url])
+
+            logger.info("starting crawling")
+
+            for i in xrange(depth):
+                numThreads = min(conf.threads, len(threadData.shared.unprocessed))
+                logger.debug("processing depth: %d" % i)
+                runThreads(numThreads, crawlThread)
+                threadData.shared.unprocessed = threadData.shared.deeper
+
+        except KeyboardInterrupt:
+            warnMsg = "user aborted during crawling. sqlmap "
+            warnMsg += "will use partial list"
+            logger.warn(warnMsg)
+
+        except sqlmapConnectionException, e:
+            errMsg = "connection exception detected. sqlmap "
+            errMsg += "will use partial list"
+            errMsg += "'%s'" % e
+            logger.critical(errMsg)
+
+        finally:
+            for url in threadData.shared.outputs:
+                kb.targetUrls.add(( url, None, None, None ))
+            kb.suppressResumeInfo = False
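The same-host test inside crawlThread() is compact, so here is a standalone sketch (illustrative only, not part of the commit; Python 2 like the rest of the codebase) of what it evaluates: both URLs are reduced to their hostname with any port stripped and then compared, so only links pointing back at the target host are queued for the next depth.

# Minimal sketch, not sqlmap code: the same-host check used by crawlThread().
import urlparse

def sameHost(url, baseUrl):
    hostOf = lambda u: urlparse.urlparse(u).netloc.split(':')[0]
    return reduce(lambda x, y: x == y, map(hostOf, [url, baseUrl]))

assert sameHost("http://target.tld:8080/page.php?id=1", "http://target.tld/")
assert not sameHost("http://other.tld/page.php?id=1", "http://target.tld/")

Note that, as committed, crawlThread() requests conf.url on every iteration while the popped current entry goes unused in this hunk.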
@@ -60,7 +60,7 @@ class Google:
         """

         for match in self.__matches:
-            if re.search(r"(.*?)\?(.+)", match, re.I):
+            if re.search(r"(.*?)\?(.+)", match):
                 kb.targetUrls.add(( htmlunescape(htmlunescape(match)), None, None, None ))
             elif re.search(URI_INJECTABLE_REGEX, match, re.I):
                 kb.targetUrls.add(( htmlunescape(htmlunescape("%s" % match)), None, None, None ))
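The only change in this hunk drops the re.I flag from the query-string test; the pattern contains no letters, so case-insensitivity adds nothing. A standalone sketch (illustrative only) of what r"(.*?)\?(.+)" keeps, here and in the crawler above:

# Minimal sketch, not sqlmap code: the query-string filter keeps likely injectable GET URLs.
import re

candidates = ["http://target.tld/news.php?id=1", "http://target.tld/about.html"]
withQuery = [url for url in candidates if re.search(r"(.*?)\?(.+)", url)]
assert withQuery == ["http://target.tld/news.php?id=1"]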
@@ -543,6 +543,10 @@ checkPayload = False
 # Valid: True or False
 cleanup = False

+# Crawl the website starting from the target url
+# Valid: True or False
+crawl = False
+
 # Parse and test forms on target url
 # Valid: True or False
 forms = False