sqlmap/lib/utils/crawler.py

#!/usr/bin/env python

"""
Copyright (c) 2006-2023 sqlmap developers (https://sqlmap.org/)
See the file 'LICENSE' for copying permission
"""

from __future__ import division

import os
import re
import tempfile
import time

from lib.core.common import checkSameHost
from lib.core.common import clearConsoleLine
from lib.core.common import dataToStdout
from lib.core.common import extractRegexResult
from lib.core.common import findPageForms
from lib.core.common import getSafeExString
from lib.core.common import openFile
from lib.core.common import readInput
from lib.core.common import safeCSValue
from lib.core.common import urldecode
from lib.core.compat import xrange
from lib.core.convert import htmlUnescape
from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
from lib.core.datatype import OrderedSet
from lib.core.enums import MKSTEMP_PREFIX
from lib.core.exception import SqlmapConnectionException
from lib.core.exception import SqlmapSyntaxException
from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
from lib.parse.sitemap import parseSitemap
from lib.request.connect import Connect as Request
from thirdparty import six
from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
from thirdparty.six.moves import http_client as _http_client
from thirdparty.six.moves import urllib as _urllib

def crawl(target, post=None, cookie=None):
    """
    Crawls the site starting from the given target URL, up to conf.crawlDepth
    levels deep, and registers found URLs carrying a query string as scan
    targets (inside kb.targets)

    Pages are fetched by worker threads started through runThreads(). Along
    the way user can be asked whether to check for 'sitemap.xml', whether to
    normalize crawling results and (through storeResultsToFile()) whether to
    store them to a temporary file. KeyboardInterrupt is caught here, in which
    case the partial list of results collected so far is used.

    target -- starting URL (nothing is done if it is empty)
    post   -- value forwarded as 'post' to Request.getPage() for each visited URL
    cookie -- accepted, but not used inside of this function (see the NOTE at
              the Request.getPage() call)
    """

    if not target:
        return

    try:
        # URLs that have already been taken for processing by some worker thread
        visited = set()
        threadData = getCurrentThreadData()
        # collected URLs that carry a query string (i.e. candidates for kb.targets)
        threadData.shared.value = OrderedSet()
        threadData.shared.formsFound = False

        def crawlThread():
            # worker body run through runThreads(); state is exchanged with the
            # caller through threadData.shared (unprocessed, deeper, value, ...)
            # NOTE(review): presumably 'shared' is common to all threads - confirm in lib.core.threads
            threadData = getCurrentThreadData()

            while kb.threadContinue:
                # picking of the next URL (and marking it as visited) is done under the lock
                with kb.locks.limit:
                    if threadData.shared.unprocessed:
                        current = threadData.shared.unprocessed.pop()
                        if current in visited:
                            continue
                        elif conf.crawlExclude and re.search(conf.crawlExclude, current):
                            dbgMsg = "skipping '%s'" % current
                            logger.debug(dbgMsg)
                            continue
                        else:
                            visited.add(current)
                    else:
                        # nothing left to process at this depth
                        break

                # stays None if the request fails (failures are only logged)
                content = None
                try:
                    if current:
                        # NOTE(review): 'cookie' argument of crawl() is not forwarded (cookie=None is
                        # passed explicitly) - confirm whether that is intentional
                        content = Request.getPage(url=current, post=post, cookie=None, crawling=True, raise404=False)[0]
                except SqlmapConnectionException as ex:
                    errMsg = "connection exception detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)
                except SqlmapSyntaxException:
                    errMsg = "invalid URL detected. skipping '%s'" % current
                    logger.critical(errMsg)
                except _http_client.InvalidURL as ex:
                    errMsg = "invalid URL detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)

                if not kb.threadContinue:
                    break

                # only textual content is parsed for links
                if isinstance(content, six.text_type):
                    try:
                        # keeping only the part between <html> and </html> (if there is such)
                        match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
                        if match:
                            content = "<html>%s</html>" % match.group(1)

                        soup = BeautifulSoup(content)
                        tags = soup('a')

                        # parsed anchors are supplemented with regex matches for href/src
                        # attributes and window.open(...) calls found in the raw content
                        tags += re.finditer(r'(?i)\s(href|src)=["\'](?P<href>[^>"\']+)', content)
                        tags += re.finditer(r'(?i)window\.open\(["\'](?P<href>[^)"\']+)["\']', content)

                        for tag in tags:
                            # parsed tags provide get(), while regex matches provide group()
                            href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

                            if href:
                                # if the recorded redirection belongs to the last request, found
                                # links are resolved against the redirection URL
                                if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:
                                    current = threadData.lastRedirectURL[1]
                                url = _urllib.parse.urljoin(current, htmlUnescape(href))

                                # flag to know if we are dealing with the same target host
                                _ = checkSameHost(url, target)

                                # if provided, conf.scope regex decides (instead of the same host check)
                                if conf.scope:
                                    if not re.search(conf.scope, url, re.I):
                                        continue
                                elif not _:
                                    continue

                                # skipping URLs with file extension listed in CRAWL_EXCLUDE_EXTENSIONS
                                if (extractRegexResult(r"\A[^?]+\.(?P<result>\w+)(\?|\Z)", url) or "").lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                                    with kb.locks.value:
                                        # candidate for the next crawling depth
                                        threadData.shared.deeper.add(url)
                                        # result has to carry a query string, which is not a bare number
                                        # (e.g. '?123' or '?v=123'), and must not be a .js/.css resource
                                        if re.search(r"(.*?)\?(.+)", url) and not re.search(r"\?(v=)?\d+\Z", url) and not re.search(r"(?i)\.(js|css)(\?|\Z)", url):
                                            threadData.shared.value.add(url)
                    except UnicodeEncodeError:  # for non-HTML files
                        pass
                    except ValueError:          # for non-valid links
                        pass
                    except AssertionError:      # for invalid HTML
                        pass
                    finally:
                        # search for forms is done even if the link extraction has failed
                        if conf.forms:
                            threadData.shared.formsFound |= len(findPageForms(content, current, False, True)) > 0

                # progress line is shown only for verbosity levels 1 and 2
                if conf.verbose in (1, 2):
                    threadData.shared.count += 1
                    status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
                    dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)

        # links found while crawling (used as the input for the next depth)
        threadData.shared.deeper = set()
        threadData.shared.unprocessed = set([target])

        # target with everything from the first single '/' (i.e. not part of '//') removed;
        # it is queued too if it differs from the target itself
        _ = re.sub(r"(?<!/)/(?!/).*", "", target)
        if _:
            if target.strip('/') != _.strip('/'):
                threadData.shared.unprocessed.add(_)

        # target carrying a query string parameter is a result on its own
        if re.search(r"\?.*\b\w+=", target):
            threadData.shared.value.add(target)

        # user is asked only once (answer is kept in kb.checkSitemap)
        if kb.checkSitemap is None:
            message = "do you want to check for the existence of "
            message += "site's sitemap(.xml) [y/N] "
            kb.checkSitemap = readInput(message, default='N', boolean=True)

        if kb.checkSitemap:
            found = True
            items = None
            url = _urllib.parse.urljoin(target, "/sitemap.xml")
            try:
                items = parseSitemap(url)
            except SqlmapConnectionException as ex:
                if "page not found" in getSafeExString(ex):
                    found = False
                    logger.warning("'sitemap.xml' not found")
            except:
                # any other failure is ignored (ending with the "no links found" message below)
                pass
            finally:
                if found:
                    if items:
                        for item in items:
                            if re.search(r"(.*?)\?(.+)", item):
                                threadData.shared.value.add(item)
                        # sitemap entries are crawled only if more than one depth level is requested
                        if conf.crawlDepth > 1:
                            threadData.shared.unprocessed.update(items)
                    logger.info("%s links found" % ("no" if not items else len(items)))

        if not conf.bulkFile:
            infoMsg = "starting crawler for target URL '%s'" % target
            logger.info(infoMsg)

        for i in xrange(conf.crawlDepth):
            # counters used by the progress line inside of crawlThread()
            threadData.shared.count = 0
            threadData.shared.length = len(threadData.shared.unprocessed)
            numThreads = min(conf.threads, len(threadData.shared.unprocessed))

            if not conf.bulkFile:
                logger.info("searching for links with depth %d" % (i + 1))

            runThreads(numThreads, crawlThread, threadChoice=(i > 0))
            clearConsoleLine(True)

            # 'deeper' is never emptied between the levels; already visited
            # URLs are skipped inside of crawlThread()
            if threadData.shared.deeper:
                threadData.shared.unprocessed = set(threadData.shared.deeper)
            else:
                break

    except KeyboardInterrupt:
        warnMsg = "user aborted during crawling. sqlmap "
        warnMsg += "will use partial list"
        logger.warning(warnMsg)

    finally:
        # done also in case of an user abort, so that the partial results are used
        clearConsoleLine(True)

        if not threadData.shared.value:
            if not (conf.forms and threadData.shared.formsFound):
                warnMsg = "no usable links found (with GET parameters)"
                if conf.forms:
                    warnMsg += " or forms"
                logger.warning(warnMsg)
        else:
            # only the URL (first item) is set for targets found by crawling
            for url in threadData.shared.value:
                kb.targets.add((urldecode(url, kb.pageEncoding), None, None, None, None))

        if kb.targets:
            if kb.normalizeCrawlingChoice is None:
                message = "do you want to normalize "
                message += "crawling results [Y/n] "

                kb.normalizeCrawlingChoice = readInput(message, default='Y', boolean=True)

            if kb.normalizeCrawlingChoice:
                # normalization keeps only the first target for each distinct combination
                # of the last path segment and parameter names (values are ignored)
                seen = set()
                results = OrderedSet()

                # note: loop variable rebinds the 'target' argument (not needed past this point)
                for target in kb.targets:
                    # URL joined with the third item of the target tuple (named 'data' in storeResultsToFile())
                    value = "%s%s%s" % (target[0], '&' if '?' in target[0] else '?', target[2] or "")
                    match = re.search(r"/[^/?]*\?.+\Z", value)
                    if match:
                        # blanking of parameter values (e.g. '/page.php?id=1&cat=2' -> '/page.php?id=&cat=')
                        key = re.sub(r"=[^=&]*", "=", match.group(0)).strip("&?")
                        # targets without any parameter are dropped
                        if '=' in key and key not in seen:
                            results.add(target)
                            seen.add(key)

                kb.targets = results

            storeResultsToFile(kb.targets)

def storeResultsToFile(results):
    """
    Writes crawling results to a newly created temporary file (if the user
    agrees to it)

    User is asked only if no answer has been given so far (answer is kept in
    kb.storeCrawlingChoice). If conf.forms is set, the file is a CSV one
    (header "URL,POST", followed by one "url,data" line per result), while
    otherwise it is a plain text file with one URL per line.

    results -- iterable of 5-item tuples having the URL as the first and the
               (POST) data as the third item (nothing is done if it is empty)
    """

    if not results:
        return

    if kb.storeCrawlingChoice is None:
        kb.storeCrawlingChoice = readInput(
            "do you want to store crawling results to a temporary file "
            "for eventual further processing with other tools [y/N] ",
            default='N',
            boolean=True
        )

    if not kb.storeCrawlingChoice:
        return

    # CSV format is used when forms are being processed too (POST data column is needed)
    csvOutput = conf.forms
    extension = ".csv" if csvOutput else ".txt"

    # mkstemp() is used just for reserving the file name - descriptor is closed
    # right away and the file is (re)opened through openFile()
    descriptor, path = tempfile.mkstemp(prefix=MKSTEMP_PREFIX.CRAWLER, suffix=extension)
    os.close(descriptor)

    logger.info("writing crawling results to a temporary file '%s' " % path)

    with openFile(path, "w+b") as output:
        if csvOutput:
            output.write("URL,POST\n")

        for record in results:
            url, _, data, _, _ = record

            if csvOutput:
                line = "%s,%s\n" % (safeCSValue(url), safeCSValue(data or ""))
            else:
                line = "%s\n" % url

            output.write(line)
Last preparations for DREI 2019-05-08 13:47:52 +03:00			`#!/usr/bin/env python`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
			`"""`
Year and version bump 2023-01-03 01:24:59 +03:00			`Copyright (c) 2006-2023 sqlmap developers (https://sqlmap.org/)`
Replacing doc/COPYING to LICENSE 2017-10-11 15:50:46 +03:00			`See the file 'LICENSE' for copying permission`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`"""`

Further pleasing pylint deity 2019-06-04 13:15:39 +03:00			`from __future__ import division`

Storing crawling results to a temporary file (for eventual further processing) 2014-11-20 18:29:17 +03:00			`import os`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`import re`
Storing crawling results to a temporary file (for eventual further processing) 2014-11-20 18:29:17 +03:00			`import tempfile`
minor update 2011-06-20 18:27:24 +04:00			`import time`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
Minor patches (and one bug from ML) 2016-12-20 11:53:44 +03:00			`from lib.core.common import checkSameHost`
minor update 2011-06-20 18:27:24 +04:00			`from lib.core.common import clearConsoleLine`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`from lib.core.common import dataToStdout`
Minor improvement for crawling 2019-10-02 14:08:13 +03:00			`from lib.core.common import extractRegexResult`
adding compatibility support for using --crawl and --forms together 2011-10-29 13:32:20 +04:00			`from lib.core.common import findPageForms`
Adding new warning message 2015-11-08 01:30:24 +03:00			`from lib.core.common import getSafeExString`
Patch for an Issue #976 2014-11-26 15:38:21 +03:00			`from lib.core.common import openFile`
Storing crawling results to a temporary file (for eventual further processing) 2014-11-20 18:29:17 +03:00			`from lib.core.common import readInput`
			`from lib.core.common import safeCSValue`
Fixes #2501 2017-04-24 00:50:30 +03:00			`from lib.core.common import urldecode`
Some more DREI stuff 2019-03-28 18:04:38 +03:00			`from lib.core.compat import xrange`
Trivial case update 2019-05-20 12:24:43 +03:00			`from lib.core.convert import htmlUnescape`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`from lib.core.data import conf`
			`from lib.core.data import kb`
			`from lib.core.data import logger`
Foo and fo 2019-03-27 17:48:51 +03:00			`from lib.core.datatype import OrderedSet`
Minor refactoring 2016-05-31 14:02:26 +03:00			`from lib.core.enums import MKSTEMP_PREFIX`
Doing some more style updating (capitalization of exception classes; using _ is enough for private members - __ is used in Python specific methods) 2012-12-06 17:14:19 +04:00			`from lib.core.exception import SqlmapConnectionException`
Fixes #1444 2015-10-05 17:33:10 +03:00			`from lib.core.exception import SqlmapSyntaxException`
crawler fix (skip binary files) 2011-06-21 02:41:38 +04:00			`from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`from lib.core.threads import getCurrentThreadData`
			`from lib.core.threads import runThreads`
Automatically checking for sitemap existence in case of --crawl 2015-01-20 12:03:35 +03:00			`from lib.parse.sitemap import parseSitemap`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`from lib.request.connect import Connect as Request`
Some more DREI stuff 2019-04-19 12:24:34 +03:00			`from thirdparty import six`
More work for Issue #66 2012-07-14 19:01:04 +04:00			`from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup`
God help us all with this Python3 non-sense 2019-03-27 15:33:46 +03:00			`from thirdparty.six.moves import http_client as _http_client`
			`from thirdparty.six.moves import urllib as _urllib`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
Fixes #4012 2019-11-15 12:02:51 +03:00			`def crawl(target, post=None, cookie=None):`
Quick patch for #4012 2019-11-15 11:50:00 +03:00			`if not target:`
			`return`

Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`try:`
Minor update for crawler 2013-04-30 20:32:46 +04:00			`visited = set()`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`threadData = getCurrentThreadData()`
Foo and fo 2019-03-27 17:48:51 +03:00			`threadData.shared.value = OrderedSet()`
Minor update 2019-11-07 18:04:32 +03:00			`threadData.shared.formsFound = False`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00
			`def crawlThread():`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`threadData = getCurrentThreadData()`

Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`while kb.threadContinue:`
			`with kb.locks.limit:`
			`if threadData.shared.unprocessed:`
			`current = threadData.shared.unprocessed.pop()`
Minor update for crawler 2013-04-30 20:32:46 +04:00			`if current in visited:`
			`continue`
Implements #1215 2015-04-06 23:07:22 +03:00			`elif conf.crawlExclude and re.search(conf.crawlExclude, current):`
			`dbgMsg = "skipping '%s'" % current`
			`logger.debug(dbgMsg)`
			`continue`
Minor update for crawler 2013-04-30 20:32:46 +04:00			`else:`
			`visited.add(current)`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`else:`
			`break`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`content = None`
			`try:`
			`if current:`
Fixes #4012 2019-11-15 12:02:51 +03:00			`content = Request.getPage(url=current, post=post, cookie=None, crawling=True, raise404=False)[0]`
Baby steps (2 to 3 at a time) 2019-01-22 02:40:48 +03:00			`except SqlmapConnectionException as ex:`
Minor beauty patch 2017-11-06 12:36:17 +03:00			`errMsg = "connection exception detected ('%s'). skipping " % getSafeExString(ex)`
Style and consistency update (url -> URL) 2013-04-09 13:48:42 +04:00			`errMsg += "URL '%s'" % current`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`logger.critical(errMsg)`
Fixes #1444 2015-10-05 17:33:10 +03:00			`except SqlmapSyntaxException:`
			`errMsg = "invalid URL detected. skipping '%s'" % current`
			`logger.critical(errMsg)`
God help us all with this Python3 non-sense 2019-03-27 15:33:46 +03:00			`except _http_client.InvalidURL as ex:`
Minor beauty patch 2017-11-06 12:36:17 +03:00			`errMsg = "invalid URL detected ('%s'). skipping " % getSafeExString(ex)`
Style and consistency update (url -> URL) 2013-04-09 13:48:42 +04:00			`errMsg += "URL '%s'" % current`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`logger.critical(errMsg)`

			`if not kb.threadContinue:`
			`break`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
Some more DREI stuff 2019-04-19 12:24:34 +03:00			`if isinstance(content, six.text_type):`
minor update 2011-06-21 01:47:03 +04:00			`try:`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)`
			`if match:`
			`content = "<html>%s</html>" % match.group(1)`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`soup = BeautifulSoup(content)`
			`tags = soup('a')`
Fix for an Issue #324 (crawling when HTML is not well-formed) 2012-12-27 23:55:37 +04:00
Minor update 2019-11-01 00:47:36 +03:00			`tags += re.finditer(r'(?i)\s(href\|src)=["\'](?P<href>[^>"\']+)', content)`
Minor improvement 2019-11-12 17:38:59 +03:00			`tags += re.finditer(r'(?i)window\.open\(["\'](?P<href>[^)"\']+)["\']', content)`
Fix for an Issue #324 (crawling when HTML is not well-formed) 2012-12-27 23:55:37 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`for tag in tags:`
			`href = tag.get("href") if hasattr(tag, "get") else tag.group("href")`
Fix for an Issue #324 (crawling when HTML is not well-formed) 2012-12-27 23:55:37 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`if href:`
Fix for crawler and redirection case 2013-04-30 20:08:26 +04:00			`if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:`
			`current = threadData.lastRedirectURL[1]`
Trivial case update 2019-05-20 12:24:43 +03:00			`url = _urllib.parse.urljoin(current, htmlUnescape(href))`
Fix for an Issue #324 (crawling when HTML is not well-formed) 2012-12-27 23:55:37 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`# flag to know if we are dealing with the same target host`
Minor patches (and one bug from ML) 2016-12-20 11:53:44 +03:00			`_ = checkSameHost(url, target)`
removing of unused imports together with some general code refactoring 2012-02-22 14:40:11 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`if conf.scope:`
			`if not re.search(conf.scope, url, re.I):`
minor fix for crawler and far less message overlaps in future 2011-06-21 01:18:12 +04:00			`continue`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`elif not _:`
			`continue`

Minor improvement for crawling 2019-10-02 14:08:13 +03:00			`if (extractRegexResult(r"\A[^?]+\.(?P<result>\w+)(\?\|\Z)", url) or "").lower() not in CRAWL_EXCLUDE_EXTENSIONS:`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`with kb.locks.value:`
			`threadData.shared.deeper.add(url)`
Minor update 2019-11-09 02:54:47 +03:00			`if re.search(r"(.*?)\?(.+)", url) and not re.search(r"\?(v=)?\d+\Z", url) and not re.search(r"(?i)\.(js\|css)(\?\|\Z)", url):`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`threadData.shared.value.add(url)`
Some PEP8 related style cleaning 2013-01-10 16:18:44 +04:00			`except UnicodeEncodeError: # for non-HTML files`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`pass`
Minor updates 2017-07-05 14:51:48 +03:00			`except ValueError: # for non-valid links`
			`pass`
Fixes #5093 2022-05-09 15:54:28 +03:00			`except AssertionError: # for invalid HTML`
			`pass`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`finally:`
			`if conf.forms:`
Minor update 2019-11-07 18:04:32 +03:00			`threadData.shared.formsFound \|= len(findPageForms(content, current, False, True)) > 0`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00
			`if conf.verbose in (1, 2):`
			`threadData.shared.count += 1`
Minor style update 2013-01-09 19:10:26 +04:00			`status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)`

			`threadData.shared.deeper = set()`
Reverting set() brace form because of Python 2.6 compatibility issues 2018-01-31 13:24:28 +03:00			`threadData.shared.unprocessed = set([target])`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00
Minor improvements 2019-11-14 16:21:53 +03:00			`_ = re.sub(r"(?<!/)/(?!/).*", "", target)`
			`if _:`
			`if target.strip('/') != _.strip('/'):`
			`threadData.shared.unprocessed.add(_)`

			`if re.search(r"\?.*\b\w+=", target):`
			`threadData.shared.value.add(target)`

In case of bulk file, crawl-scan-crawl-scan... 2019-11-05 01:53:35 +03:00			`if kb.checkSitemap is None:`
			`message = "do you want to check for the existence of "`
			`message += "site's sitemap(.xml) [y/N] "`
			`kb.checkSitemap = readInput(message, default='N', boolean=True)`
Removing -x as I doubt that anybody uses it 2019-11-05 00:43:28 +03:00
In case of bulk file, crawl-scan-crawl-scan... 2019-11-05 01:53:35 +03:00			`if kb.checkSitemap:`
Removing -x as I doubt that anybody uses it 2019-11-05 00:43:28 +03:00			`found = True`
			`items = None`
			`url = _urllib.parse.urljoin(target, "/sitemap.xml")`
			`try:`
			`items = parseSitemap(url)`
			`except SqlmapConnectionException as ex:`
			`if "page not found" in getSafeExString(ex):`
			`found = False`
Fixing DeprecationWarning (logger.warn) 2022-06-22 13:04:34 +03:00			`logger.warning("'sitemap.xml' not found")`
Removing -x as I doubt that anybody uses it 2019-11-05 00:43:28 +03:00			`except:`
			`pass`
			`finally:`
			`if found:`
			`if items:`
			`for item in items:`
			`if re.search(r"(.*?)\?(.+)", item):`
			`threadData.shared.value.add(item)`
			`if conf.crawlDepth > 1:`
			`threadData.shared.unprocessed.update(items)`
			`logger.info("%s links found" % ("no" if not items else len(items)))`
Automatically checking for sitemap existence in case of --crawl 2015-01-20 12:03:35 +03:00
In case of bulk file, crawl-scan-crawl-scan... 2019-11-05 01:53:35 +03:00			`if not conf.bulkFile:`
			`infoMsg = "starting crawler for target URL '%s'" % target`
			`logger.info(infoMsg)`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00
			`for i in xrange(conf.crawlDepth):`
			`threadData.shared.count = 0`
			`threadData.shared.length = len(threadData.shared.unprocessed)`
			`numThreads = min(conf.threads, len(threadData.shared.unprocessed))`
Update for an Issue #429 2013-04-09 13:36:33 +04:00
			`if not conf.bulkFile:`
			`logger.info("searching for links with depth %d" % (i + 1))`

Some more PEPing (I hope that I haven't broke anything) 2018-03-13 15:45:42 +03:00			`runThreads(numThreads, crawlThread, threadChoice=(i > 0))`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`clearConsoleLine(True)`
Update for an Issue #429 2013-04-09 13:36:33 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`if threadData.shared.deeper:`
			`threadData.shared.unprocessed = set(threadData.shared.deeper)`
			`else:`
			`break`
removing of unused imports together with some general code refactoring 2012-02-22 14:40:11 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`except KeyboardInterrupt:`
			`warnMsg = "user aborted during crawling. sqlmap "`
			`warnMsg += "will use partial list"`
Fixing DeprecationWarning (logger.warn) 2022-06-22 13:04:34 +03:00			`logger.warning(warnMsg)`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`finally:`
			`clearConsoleLine(True)`
minor update 2011-06-20 18:27:24 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`if not threadData.shared.value:`
Minor update 2019-11-07 18:04:32 +03:00			`if not (conf.forms and threadData.shared.formsFound):`
			`warnMsg = "no usable links found (with GET parameters)"`
			`if conf.forms:`
			`warnMsg += " or forms"`
Fixing DeprecationWarning (logger.warn) 2022-06-22 13:04:34 +03:00			`logger.warning(warnMsg)`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`else:`
			`for url in threadData.shared.value:`
Fixes #2501 2017-04-24 00:50:30 +03:00			`kb.targets.add((urldecode(url, kb.pageEncoding), None, None, None, None))`
Storing crawling results to a temporary file (for eventual further processing) 2014-11-20 18:29:17 +03:00
Minor update 2019-11-06 16:45:48 +03:00			`if kb.targets:`
			`if kb.normalizeCrawlingChoice is None:`
			`message = "do you want to normalize "`
			`message += "crawling results [Y/n] "`
Implementation of crawling results normalization 2019-11-01 00:07:16 +03:00
Minor update 2019-11-06 16:45:48 +03:00			`kb.normalizeCrawlingChoice = readInput(message, default='Y', boolean=True)`
Implementation of crawling results normalization 2019-11-01 00:07:16 +03:00
Minor update 2019-11-06 16:45:48 +03:00			`if kb.normalizeCrawlingChoice:`
			`seen = set()`
			`results = OrderedSet()`
Implementation of crawling results normalization 2019-11-01 00:07:16 +03:00
Minor update 2019-11-06 16:45:48 +03:00			`for target in kb.targets:`
Minor update 2019-11-15 18:24:56 +03:00			`value = "%s%s%s" % (target[0], '&' if '?' in target[0] else '?', target[2] or "")`
Trivial patch 2019-11-15 18:27:05 +03:00			`match = re.search(r"/[^/?]*\?.+\Z", value)`
Minor improvement 2019-11-15 18:06:19 +03:00			`if match:`
Trivial patch 2019-11-15 18:27:05 +03:00			`key = re.sub(r"=[^=&]*", "=", match.group(0)).strip("&?")`
Minor patch 2019-11-15 18:36:21 +03:00			`if '=' in key and key not in seen:`
Minor improvement 2019-11-15 18:06:19 +03:00			`results.add(target)`
			`seen.add(key)`
Implementation of crawling results normalization 2019-11-01 00:07:16 +03:00
Minor update 2019-11-06 16:45:48 +03:00			`kb.targets = results`
Implementation of crawling results normalization 2019-11-01 00:07:16 +03:00
Minor update 2019-11-06 16:45:48 +03:00			`storeResultsToFile(kb.targets)`
Storing crawling results to a temporary file (for eventual further processing) 2014-11-20 18:29:17 +03:00
			`def storeResultsToFile(results):`
			`if not results:`
			`return`

			`if kb.storeCrawlingChoice is None:`
			`message = "do you want to store crawling results to a temporary file "`
			`message += "for eventual further processing with other tools [y/N] "`
Some code refactoring 2017-04-18 16:48:05 +03:00
			`kb.storeCrawlingChoice = readInput(message, default='N', boolean=True)`
Storing crawling results to a temporary file (for eventual further processing) 2014-11-20 18:29:17 +03:00
			`if kb.storeCrawlingChoice:`
Minor refactoring 2016-05-31 14:02:26 +03:00			`handle, filename = tempfile.mkstemp(prefix=MKSTEMP_PREFIX.CRAWLER, suffix=".csv" if conf.forms else ".txt")`
Storing crawling results to a temporary file (for eventual further processing) 2014-11-20 18:29:17 +03:00			`os.close(handle)`

			`infoMsg = "writing crawling results to a temporary file '%s' " % filename`
			`logger.info(infoMsg)`

Patch for an Issue #976 2014-11-26 15:38:21 +03:00			`with openFile(filename, "w+b") as f:`
Storing crawling results to a temporary file (for eventual further processing) 2014-11-20 18:29:17 +03:00			`if conf.forms:`
			`f.write("URL,POST\n")`

			`for url, _, data, _, _ in results:`
			`if conf.forms:`
			`f.write("%s,%s\n" % (safeCSValue(url), safeCSValue(data or "")))`
			`else:`
			`f.write("%s\n" % url)`