sqlmap/lib/utils/crawler.py

#!/usr/bin/env python2

"""
Copyright (c) 2006-2019 sqlmap developers (http://sqlmap.org/)
See the file 'LICENSE' for copying permission
"""

import os
import re
import tempfile
import time

from lib.core.common import checkSameHost
from lib.core.common import clearConsoleLine
from lib.core.common import dataToStdout
from lib.core.common import findPageForms
from lib.core.common import getSafeExString
from lib.core.common import openFile
from lib.core.common import readInput
from lib.core.common import safeCSValue
from lib.core.common import urldecode
from lib.core.compat import xrange
from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
from lib.core.datatype import OrderedSet
from lib.core.enums import MKSTEMP_PREFIX
from lib.core.exception import SqlmapConnectionException
from lib.core.exception import SqlmapSyntaxException
from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
from lib.parse.sitemap import parseSitemap
from lib.request.connect import Connect as Request
from thirdparty import six
from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
from thirdparty.six.moves import http_client as _http_client
from thirdparty.six.moves import urllib as _urllib

def crawl(target):
    """
    Crawls the links reachable from the given target URL, level by level,
    for at most conf.crawlDepth levels. Every discovered URL that carries
    a query string is added to kb.targets. Optionally (on user's choice)
    the site's '/sitemap.xml' is used as an additional source of links.

    Links are followed only if they match conf.scope (when set) or belong
    to the same host as the target (otherwise), and only if the extension
    of their path is not listed in CRAWL_EXCLUDE_EXTENSIONS.

    On user's abort (Ctrl-C) the links collected so far are still used.
    At the end the content of kb.targets is handed to storeResultsToFile().
    """

    try:
        visited = set()
        threadData = getCurrentThreadData()
        threadData.shared.value = OrderedSet()

        def crawlThread():
            # worker body: consumes URLs from the shared 'unprocessed' set
            # and feeds the shared 'deeper' (next level) and 'value'
            # (usable targets) collections
            threadData = getCurrentThreadData()

            while kb.threadContinue:
                # picking of the next URL and the 'visited' bookkeeping are
                # done while holding the lock
                with kb.locks.limit:
                    if threadData.shared.unprocessed:
                        current = threadData.shared.unprocessed.pop()
                        if current in visited:
                            continue
                        elif conf.crawlExclude and re.search(conf.crawlExclude, current):
                            dbgMsg = "skipping '%s'" % current
                            logger.debug(dbgMsg)
                            continue
                        else:
                            visited.add(current)
                    else:
                        break

                content = None
                try:
                    if current:
                        content = Request.getPage(url=current, crawling=True, raise404=False)[0]
                except SqlmapConnectionException as ex:
                    errMsg = "connection exception detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)
                except SqlmapSyntaxException:
                    errMsg = "invalid URL detected. skipping '%s'" % current
                    logger.critical(errMsg)
                except _http_client.InvalidURL as ex:
                    errMsg = "invalid URL detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)

                if not kb.threadContinue:
                    break

                if isinstance(content, six.text_type):
                    try:
                        # keep only the part enclosed by <html>...</html> (if any)
                        match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
                        if match:
                            content = "<html>%s</html>" % match.group(1)

                        soup = BeautifulSoup(content)
                        tags = soup('a')

                        # fallback to a plain regular expression when the
                        # parser has found no anchors
                        if not tags:
                            tags = re.finditer(r'(?i)<a[^>]+href="(?P<href>[^>"]+)"', content)

                        for tag in tags:
                            # parser tags provide .get(), regex matches provide .group()
                            href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

                            if href:
                                # in case of a redirection of the last request
                                # relative links are resolved against the
                                # redirection URL
                                if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:
                                    current = threadData.lastRedirectURL[1]
                                url = _urllib.parse.urljoin(current, href)

                                # flag to know if we are dealing with the same target host
                                sameHost = checkSameHost(url, target)

                                if conf.scope:
                                    if not re.search(conf.scope, url, re.I):
                                        continue
                                elif not sameHost:
                                    continue

                                # extension is taken from the path part only, so that
                                # a query string can neither hide an excluded file
                                # (e.g. '/file.pdf?id=1') nor exclude a regular page
                                # (e.g. '/download.php?file=a.pdf')
                                extension = _urllib.parse.urlsplit(url).path.split('.')[-1].lower()

                                if extension not in CRAWL_EXCLUDE_EXTENSIONS:
                                    with kb.locks.value:
                                        threadData.shared.deeper.add(url)
                                        # only URLs with a query string are usable as targets
                                        if re.search(r"(.*?)\?(.+)", url):
                                            threadData.shared.value.add(url)
                    except UnicodeEncodeError:  # for non-HTML files
                        pass
                    except ValueError:          # for non-valid links
                        pass
                    finally:
                        if conf.forms:
                            findPageForms(content, current, False, True)

                if conf.verbose in (1, 2):
                    threadData.shared.count += 1
                    status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
                    dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)

        threadData.shared.deeper = set()
        threadData.shared.unprocessed = set([target])

        if not conf.sitemapUrl:
            message = "do you want to check for the existence of "
            message += "site's sitemap(.xml) [y/N] "

            if readInput(message, default='N', boolean=True):
                found = True
                items = None
                url = _urllib.parse.urljoin(target, "/sitemap.xml")
                try:
                    items = parseSitemap(url)
                except SqlmapConnectionException as ex:
                    if "page not found" in getSafeExString(ex):
                        found = False
                        logger.warn("'sitemap.xml' not found")
                except Exception as ex:
                    # sitemap usage is a best effort, hence regular errors
                    # are not fatal. KeyboardInterrupt (and SystemExit) are
                    # deliberately not caught here so that user's abort
                    # reaches the handler at the end of this function
                    logger.debug("problem occurred while parsing sitemap ('%s')" % getSafeExString(ex))

                if found:
                    if items:
                        for item in items:
                            if re.search(r"(.*?)\?(.+)", item):
                                threadData.shared.value.add(item)
                        # sitemap links are crawled only when going deeper
                        # than the target page itself
                        if conf.crawlDepth > 1:
                            threadData.shared.unprocessed.update(items)
                    logger.info("%s links found" % ("no" if not items else len(items)))

        infoMsg = "starting crawler"
        if conf.bulkFile:
            infoMsg += " for target URL '%s'" % target
        logger.info(infoMsg)

        for i in xrange(conf.crawlDepth):
            threadData.shared.count = 0
            threadData.shared.length = len(threadData.shared.unprocessed)
            numThreads = min(conf.threads, len(threadData.shared.unprocessed))

            if not conf.bulkFile:
                logger.info("searching for links with depth %d" % (i + 1))

            runThreads(numThreads, crawlThread, threadChoice=(i > 0))
            clearConsoleLine(True)

            # links found at this level are the input for the next one
            if threadData.shared.deeper:
                threadData.shared.unprocessed = set(threadData.shared.deeper)
            else:
                break

    except KeyboardInterrupt:
        warnMsg = "user aborted during crawling. sqlmap "
        warnMsg += "will use partial list"
        logger.warn(warnMsg)

    finally:
        clearConsoleLine(True)

        if not threadData.shared.value:
            warnMsg = "no usable links found (with GET parameters)"
            logger.warn(warnMsg)
        else:
            for url in threadData.shared.value:
                kb.targets.add((urldecode(url, kb.pageEncoding), None, None, None, None))

        storeResultsToFile(kb.targets)

def storeResultsToFile(results):
    """
    Writes the given crawling results into a newly created temporary file
    (on user's choice, which is asked once and remembered in
    kb.storeCrawlingChoice). Nothing is done for empty results.

    Each result is expected to be a 5-item tuple having the URL as the
    first and the (POST) data as the third item. With conf.forms set the
    output is a CSV file with the header "URL,POST", otherwise a plain
    text file with one URL per line.
    """

    if not results:
        return

    # user is asked only the first time
    if kb.storeCrawlingChoice is None:
        question = (
            "do you want to store crawling results to a temporary file "
            "for eventual further processing with other tools [y/N] "
        )
        kb.storeCrawlingChoice = readInput(question, default='N', boolean=True)

    if not kb.storeCrawlingChoice:
        return

    asForms = conf.forms
    extension = ".csv" if asForms else ".txt"

    # only the name of the temporary file is needed, hence the descriptor
    # returned by mkstemp() is closed right away
    descriptor, path = tempfile.mkstemp(prefix=MKSTEMP_PREFIX.CRAWLER, suffix=extension)
    os.close(descriptor)

    logger.info("writing crawling results to a temporary file '%s' " % path)

    with openFile(path, "w+b") as output:
        if asForms:
            output.write("URL,POST\n")
            for link, _, payload, _, _ in results:
                output.write("%s,%s\n" % (safeCSValue(link), safeCSValue(payload or "")))
        else:
            for link, _, payload, _, _ in results:
                output.write("%s\n" % link)
Update regarding #2940 (PEP 394) 2019-03-21 16:00:09 +03:00			`#!/usr/bin/env python2`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
			`"""`
update_copyright_year() 2019-01-05 23:38:52 +03:00			`Copyright (c) 2006-2019 sqlmap developers (http://sqlmap.org/)`
Replacing doc/COPYING to LICENSE 2017-10-11 15:50:46 +03:00			`See the file 'LICENSE' for copying permission`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`"""`

Storing crawling results to a temporary file (for eventual further processing) 2014-11-20 18:29:17 +03:00			`import os`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`import re`
Storing crawling results to a temporary file (for eventual further processing) 2014-11-20 18:29:17 +03:00			`import tempfile`
minor update 2011-06-20 18:27:24 +04:00			`import time`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
Minor patches (and one bug from ML) 2016-12-20 11:53:44 +03:00			`from lib.core.common import checkSameHost`
minor update 2011-06-20 18:27:24 +04:00			`from lib.core.common import clearConsoleLine`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`from lib.core.common import dataToStdout`
adding compatibility support for using --crawl and --forms together 2011-10-29 13:32:20 +04:00			`from lib.core.common import findPageForms`
Adding new warning message 2015-11-08 01:30:24 +03:00			`from lib.core.common import getSafeExString`
Patch for an Issue #976 2014-11-26 15:38:21 +03:00			`from lib.core.common import openFile`
Storing crawling results to a temporary file (for eventual further processing) 2014-11-20 18:29:17 +03:00			`from lib.core.common import readInput`
			`from lib.core.common import safeCSValue`
Fixes #2501 2017-04-24 00:50:30 +03:00			`from lib.core.common import urldecode`
Some more DREI stuff 2019-03-28 18:04:38 +03:00			`from lib.core.compat import xrange`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`from lib.core.data import conf`
			`from lib.core.data import kb`
			`from lib.core.data import logger`
Foo and fo 2019-03-27 17:48:51 +03:00			`from lib.core.datatype import OrderedSet`
Minor refactoring 2016-05-31 14:02:26 +03:00			`from lib.core.enums import MKSTEMP_PREFIX`
Doing some more style updating (capitalization of exception classes; using _ is enough for private members - __ is used in Python specific methods) 2012-12-06 17:14:19 +04:00			`from lib.core.exception import SqlmapConnectionException`
Fixes #1444 2015-10-05 17:33:10 +03:00			`from lib.core.exception import SqlmapSyntaxException`
crawler fix (skip binary files) 2011-06-21 02:41:38 +04:00			`from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`from lib.core.threads import getCurrentThreadData`
			`from lib.core.threads import runThreads`
Automatically checking for sitemap existence in case of --crawl 2015-01-20 12:03:35 +03:00			`from lib.parse.sitemap import parseSitemap`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`from lib.request.connect import Connect as Request`
Some more DREI stuff 2019-04-19 12:24:34 +03:00			`from thirdparty import six`
More work for Issue #66 2012-07-14 19:01:04 +04:00			`from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup`
God help us all with this Python3 non-sense 2019-03-27 15:33:46 +03:00			`from thirdparty.six.moves import http_client as _http_client`
			`from thirdparty.six.moves import urllib as _urllib`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`def crawl(target):`
			`try:`
Minor update for crawler 2013-04-30 20:32:46 +04:00			`visited = set()`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`threadData = getCurrentThreadData()`
Foo and fo 2019-03-27 17:48:51 +03:00			`threadData.shared.value = OrderedSet()`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00
			`def crawlThread():`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`threadData = getCurrentThreadData()`

Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`while kb.threadContinue:`
			`with kb.locks.limit:`
			`if threadData.shared.unprocessed:`
			`current = threadData.shared.unprocessed.pop()`
Minor update for crawler 2013-04-30 20:32:46 +04:00			`if current in visited:`
			`continue`
Implements #1215 2015-04-06 23:07:22 +03:00			`elif conf.crawlExclude and re.search(conf.crawlExclude, current):`
			`dbgMsg = "skipping '%s'" % current`
			`logger.debug(dbgMsg)`
			`continue`
Minor update for crawler 2013-04-30 20:32:46 +04:00			`else:`
			`visited.add(current)`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`else:`
			`break`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`content = None`
			`try:`
			`if current:`
			`content = Request.getPage(url=current, crawling=True, raise404=False)[0]`
Baby steps (2 to 3 at a time) 2019-01-22 02:40:48 +03:00			`except SqlmapConnectionException as ex:`
Minor beauty patch 2017-11-06 12:36:17 +03:00			`errMsg = "connection exception detected ('%s'). skipping " % getSafeExString(ex)`
Style and consistency update (url -> URL) 2013-04-09 13:48:42 +04:00			`errMsg += "URL '%s'" % current`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`logger.critical(errMsg)`
Fixes #1444 2015-10-05 17:33:10 +03:00			`except SqlmapSyntaxException:`
			`errMsg = "invalid URL detected. skipping '%s'" % current`
			`logger.critical(errMsg)`
God help us all with this Python3 non-sense 2019-03-27 15:33:46 +03:00			`except _http_client.InvalidURL as ex:`
Minor beauty patch 2017-11-06 12:36:17 +03:00			`errMsg = "invalid URL detected ('%s'). skipping " % getSafeExString(ex)`
Style and consistency update (url -> URL) 2013-04-09 13:48:42 +04:00			`errMsg += "URL '%s'" % current`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`logger.critical(errMsg)`

			`if not kb.threadContinue:`
			`break`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
Some more DREI stuff 2019-04-19 12:24:34 +03:00			`if isinstance(content, six.text_type):`
minor update 2011-06-21 01:47:03 +04:00			`try:`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)`
			`if match:`
			`content = "<html>%s</html>" % match.group(1)`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`soup = BeautifulSoup(content)`
			`tags = soup('a')`
Fix for an Issue #324 (crawling when HTML is not well-formed) 2012-12-27 23:55:37 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`if not tags:`
Minor patches (pydiatra) 2017-04-14 14:08:51 +03:00			`tags = re.finditer(r'(?i)<a[^>]+href="(?P<href>[^>"]+)"', content)`
Fix for an Issue #324 (crawling when HTML is not well-formed) 2012-12-27 23:55:37 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`for tag in tags:`
			`href = tag.get("href") if hasattr(tag, "get") else tag.group("href")`
Fix for an Issue #324 (crawling when HTML is not well-formed) 2012-12-27 23:55:37 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`if href:`
Fix for crawler and redirection case 2013-04-30 20:08:26 +04:00			`if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:`
			`current = threadData.lastRedirectURL[1]`
God help us all with this Python3 non-sense 2019-03-27 15:33:46 +03:00			`url = _urllib.parse.urljoin(current, href)`
Fix for an Issue #324 (crawling when HTML is not well-formed) 2012-12-27 23:55:37 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`# flag to know if we are dealing with the same target host`
Minor patches (and one bug from ML) 2016-12-20 11:53:44 +03:00			`_ = checkSameHost(url, target)`
removing of unused imports together with some general code refactoring 2012-02-22 14:40:11 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`if conf.scope:`
			`if not re.search(conf.scope, url, re.I):`
minor fix for crawler and far less message overlaps in future 2011-06-21 01:18:12 +04:00			`continue`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`elif not _:`
			`continue`

			`if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:`
			`with kb.locks.value:`
			`threadData.shared.deeper.add(url)`
			`if re.search(r"(.*?)\?(.+)", url):`
			`threadData.shared.value.add(url)`
Some PEP8 related style cleaning 2013-01-10 16:18:44 +04:00			`except UnicodeEncodeError: # for non-HTML files`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`pass`
Minor updates 2017-07-05 14:51:48 +03:00			`except ValueError: # for non-valid links`
			`pass`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`finally:`
			`if conf.forms:`
			`findPageForms(content, current, False, True)`

			`if conf.verbose in (1, 2):`
			`threadData.shared.count += 1`
Minor style update 2013-01-09 19:10:26 +04:00			`status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)`

			`threadData.shared.deeper = set()`
Reverting set() brace form because of Python 2.6 compatibility issues 2018-01-31 13:24:28 +03:00			`threadData.shared.unprocessed = set([target])`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00
Automatically checking for sitemap existence in case of --crawl 2015-01-20 12:03:35 +03:00			`if not conf.sitemapUrl:`
			`message = "do you want to check for the existence of "`
Changing default answer for sitemap checking to N 2015-04-14 10:30:01 +03:00			`message += "site's sitemap(.xml) [y/N] "`
Some code refactoring 2017-04-18 16:48:05 +03:00
			`if readInput(message, default='N', boolean=True):`
Adding new warning message 2015-11-08 01:30:24 +03:00			`found = True`
Automatically checking for sitemap existence in case of --crawl 2015-01-20 12:03:35 +03:00			`items = None`
God help us all with this Python3 non-sense 2019-03-27 15:33:46 +03:00			`url = _urllib.parse.urljoin(target, "/sitemap.xml")`
Automatically checking for sitemap existence in case of --crawl 2015-01-20 12:03:35 +03:00			`try:`
			`items = parseSitemap(url)`
Baby steps (2 to 3 at a time) 2019-01-22 02:40:48 +03:00			`except SqlmapConnectionException as ex:`
Adding new warning message 2015-11-08 01:30:24 +03:00			`if "page not found" in getSafeExString(ex):`
			`found = False`
			`logger.warn("'sitemap.xml' not found")`
Automatically checking for sitemap existence in case of --crawl 2015-01-20 12:03:35 +03:00			`except:`
			`pass`
			`finally:`
Adding new warning message 2015-11-08 01:30:24 +03:00			`if found:`
			`if items:`
			`for item in items:`
			`if re.search(r"(.*?)\?(.+)", item):`
			`threadData.shared.value.add(item)`
			`if conf.crawlDepth > 1:`
			`threadData.shared.unprocessed.update(items)`
			`logger.info("%s links found" % ("no" if not items else len(items)))`
Automatically checking for sitemap existence in case of --crawl 2015-01-20 12:03:35 +03:00
Update for an Issue #429 2013-04-09 13:36:33 +04:00			`infoMsg = "starting crawler"`
			`if conf.bulkFile:`
			`infoMsg += " for target URL '%s'" % target`
			`logger.info(infoMsg)`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00
			`for i in xrange(conf.crawlDepth):`
			`threadData.shared.count = 0`
			`threadData.shared.length = len(threadData.shared.unprocessed)`
			`numThreads = min(conf.threads, len(threadData.shared.unprocessed))`
Update for an Issue #429 2013-04-09 13:36:33 +04:00
			`if not conf.bulkFile:`
			`logger.info("searching for links with depth %d" % (i + 1))`

Some more PEPing (I hope that I haven't broke anything) 2018-03-13 15:45:42 +03:00			`runThreads(numThreads, crawlThread, threadChoice=(i > 0))`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`clearConsoleLine(True)`
Update for an Issue #429 2013-04-09 13:36:33 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`if threadData.shared.deeper:`
			`threadData.shared.unprocessed = set(threadData.shared.deeper)`
			`else:`
			`break`
removing of unused imports together with some general code refactoring 2012-02-22 14:40:11 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`except KeyboardInterrupt:`
			`warnMsg = "user aborted during crawling. sqlmap "`
			`warnMsg += "will use partial list"`
			`logger.warn(warnMsg)`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`finally:`
			`clearConsoleLine(True)`
minor update 2011-06-20 18:27:24 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`if not threadData.shared.value:`
			`warnMsg = "no usable links found (with GET parameters)"`
			`logger.warn(warnMsg)`
			`else:`
			`for url in threadData.shared.value:`
Fixes #2501 2017-04-24 00:50:30 +03:00			`kb.targets.add((urldecode(url, kb.pageEncoding), None, None, None, None))`
Storing crawling results to a temporary file (for eventual further processing) 2014-11-20 18:29:17 +03:00
			`storeResultsToFile(kb.targets)`

			`def storeResultsToFile(results):`
			`if not results:`
			`return`

			`if kb.storeCrawlingChoice is None:`
			`message = "do you want to store crawling results to a temporary file "`
			`message += "for eventual further processing with other tools [y/N] "`
Some code refactoring 2017-04-18 16:48:05 +03:00
			`kb.storeCrawlingChoice = readInput(message, default='N', boolean=True)`
Storing crawling results to a temporary file (for eventual further processing) 2014-11-20 18:29:17 +03:00
			`if kb.storeCrawlingChoice:`
Minor refactoring 2016-05-31 14:02:26 +03:00			`handle, filename = tempfile.mkstemp(prefix=MKSTEMP_PREFIX.CRAWLER, suffix=".csv" if conf.forms else ".txt")`
Storing crawling results to a temporary file (for eventual further processing) 2014-11-20 18:29:17 +03:00			`os.close(handle)`

			`infoMsg = "writing crawling results to a temporary file '%s' " % filename`
			`logger.info(infoMsg)`

Patch for an Issue #976 2014-11-26 15:38:21 +03:00			`with openFile(filename, "w+b") as f:`
Storing crawling results to a temporary file (for eventual further processing) 2014-11-20 18:29:17 +03:00			`if conf.forms:`
			`f.write("URL,POST\n")`

			`for url, _, data, _, _ in results:`
			`if conf.forms:`
			`f.write("%s,%s\n" % (safeCSValue(url), safeCSValue(data or "")))`
			`else:`
			`f.write("%s\n" % url)`