sqlmap/lib/utils/crawler.py

266 lines
11 KiB
Python
Raw Permalink Normal View History

2019-05-08 13:47:52 +03:00
#!/usr/bin/env python
"""
2023-01-03 01:24:59 +03:00
Copyright (c) 2006-2023 sqlmap developers (https://sqlmap.org/)
2017-10-11 15:50:46 +03:00
See the file 'LICENSE' for copying permission
"""
2019-06-04 13:15:39 +03:00
from __future__ import division
import os
import re
import tempfile
2011-06-20 18:27:24 +04:00
import time
2016-12-20 11:53:44 +03:00
from lib.core.common import checkSameHost
2011-06-20 18:27:24 +04:00
from lib.core.common import clearConsoleLine
from lib.core.common import dataToStdout
2019-10-02 14:08:13 +03:00
from lib.core.common import extractRegexResult
from lib.core.common import findPageForms
2015-11-08 01:30:24 +03:00
from lib.core.common import getSafeExString
2014-11-26 15:38:21 +03:00
from lib.core.common import openFile
from lib.core.common import readInput
from lib.core.common import safeCSValue
2017-04-24 00:50:30 +03:00
from lib.core.common import urldecode
2019-03-28 18:04:38 +03:00
from lib.core.compat import xrange
2019-05-20 12:24:43 +03:00
from lib.core.convert import htmlUnescape
from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
2019-03-27 17:48:51 +03:00
from lib.core.datatype import OrderedSet
2016-05-31 14:02:26 +03:00
from lib.core.enums import MKSTEMP_PREFIX
from lib.core.exception import SqlmapConnectionException
2015-10-05 17:33:10 +03:00
from lib.core.exception import SqlmapSyntaxException
2011-06-21 02:41:38 +04:00
from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
from lib.parse.sitemap import parseSitemap
from lib.request.connect import Connect as Request
2019-04-19 12:24:34 +03:00
from thirdparty import six
2012-07-14 19:01:04 +04:00
from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
from thirdparty.six.moves import http_client as _http_client
from thirdparty.six.moves import urllib as _urllib
2019-11-15 12:02:51 +03:00
def crawl(target, post=None, cookie=None):
    """
    Crawls the given target URL, level by level (at most conf.crawlDepth
    levels), and registers the discovered links carrying GET parameters
    into kb.targets

    Along the way it optionally (interactive questions, answers cached in kb):
        * seeds the crawl with entries of the site's '/sitemap.xml'
        * normalizes the results (one target per page/parameter-names pair)
        * stores the results to a temporary file (storeResultsToFile())

    @param target: starting URL; nothing is done if it is empty
    @param post: POST data forwarded to each Request.getPage() call
    @param cookie: accepted for interface compatibility; currently it is NOT
        forwarded to Request.getPage() (see the review note below)
    @return: None (results are delivered through kb.targets)
    """

    if not target:
        return

    try:
        visited = set()
        threadData = getCurrentThreadData()
        threadData.shared.value = OrderedSet()      # usable links (those having GET parameters)
        threadData.shared.formsFound = False

        def crawlThread():
            # Worker body run through runThreads(); consumes URLs from
            # threadData.shared.unprocessed until it is exhausted
            threadData = getCurrentThreadData()

            while kb.threadContinue:
                # Picking of the next URL is done while holding kb.locks.limit
                with kb.locks.limit:
                    if threadData.shared.unprocessed:
                        current = threadData.shared.unprocessed.pop()
                        if current in visited:
                            continue
                        elif conf.crawlExclude and re.search(conf.crawlExclude, current):
                            dbgMsg = "skipping '%s'" % current
                            logger.debug(dbgMsg)
                            continue
                        else:
                            visited.add(current)
                    else:
                        break

                content = None
                try:
                    if current:
                        # NOTE(review): 'cookie' argument of crawl() is not used here
                        # (cookie=None is passed explicitly) - confirm that this is intentional
                        content = Request.getPage(url=current, post=post, cookie=None, crawling=True, raise404=False)[0]
                except SqlmapConnectionException as ex:
                    errMsg = "connection exception detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)
                except SqlmapSyntaxException:
                    errMsg = "invalid URL detected. skipping '%s'" % current
                    logger.critical(errMsg)
                except _http_client.InvalidURL as ex:
                    errMsg = "invalid URL detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)

                if not kb.threadContinue:
                    break

                if isinstance(content, six.text_type):
                    try:
                        # Keep only the <html>...</html> part (if there is one)
                        match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
                        if match:
                            content = "<html>%s</html>" % match.group(1)

                        soup = BeautifulSoup(content)
                        tags = soup('a')

                        # Besides the parsed anchors, collect raw regex matches for
                        # href/src attributes and window.open() calls
                        tags += re.finditer(r'(?i)\s(href|src)=["\'](?P<href>[^>"\']+)', content)
                        tags += re.finditer(r'(?i)window\.open\(["\'](?P<href>[^)"\']+)["\']', content)

                        for tag in tags:
                            # 'tags' mixes objects having .get() with regex match objects
                            href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

                            if href:
                                # If the last request was redirected, resolve relative
                                # links against the redirection URL
                                if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:
                                    current = threadData.lastRedirectURL[1]
                                url = _urllib.parse.urljoin(current, htmlUnescape(href))

                                # flag to know if we are dealing with the same target host
                                sameHost = checkSameHost(url, target)

                                # User provided scope (if any) takes precedence over the same-host check
                                if conf.scope:
                                    if not re.search(conf.scope, url, re.I):
                                        continue
                                elif not sameHost:
                                    continue

                                if (extractRegexResult(r"\A[^?]+\.(?P<result>\w+)(\?|\Z)", url) or "").lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                                    with kb.locks.value:
                                        threadData.shared.deeper.add(url)
                                        # Usable are links having a query string which is not only a
                                        # number (optionally prefixed with "v=") and not pointing to .js/.css
                                        if re.search(r"(.*?)\?(.+)", url) and not re.search(r"\?(v=)?\d+\Z", url) and not re.search(r"(?i)\.(js|css)(\?|\Z)", url):
                                            threadData.shared.value.add(url)
                    except UnicodeEncodeError:  # for non-HTML files
                        pass
                    except ValueError:          # for non-valid links
                        pass
                    except AssertionError:      # for invalid HTML
                        pass
                    finally:
                        if conf.forms:
                            threadData.shared.formsFound |= len(findPageForms(content, current, False, True)) > 0

                if conf.verbose in (1, 2):
                    threadData.shared.count += 1
                    status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
                    dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)

        threadData.shared.deeper = set()                # links to be visited at the next depth level
        threadData.shared.unprocessed = set([target])   # links to be visited at the current depth level

        # Drop everything starting from the first single '/' (i.e. one which is not
        # a part of "//") and crawl the result too if it differs from the target
        root = re.sub(r"(?<!/)/(?!/).*", "", target)
        if root:
            if target.strip('/') != root.strip('/'):
                threadData.shared.unprocessed.add(root)

        # Target carrying GET parameter(s) is a usable link by itself
        if re.search(r"\?.*\b\w+=", target):
            threadData.shared.value.add(target)

        if kb.checkSitemap is None:
            message = "do you want to check for the existence of "
            message += "site's sitemap(.xml) [y/N] "
            kb.checkSitemap = readInput(message, default='N', boolean=True)

        if kb.checkSitemap:
            found = True
            items = None
            url = _urllib.parse.urljoin(target, "/sitemap.xml")
            try:
                items = parseSitemap(url)
            except SqlmapConnectionException as ex:
                if "page not found" in getSafeExString(ex):
                    found = False
                    logger.warning("'sitemap.xml' not found")
            except Exception as ex:
                # Best-effort: any other parsing problem is ignored. Deliberately not a
                # bare 'except' so that KeyboardInterrupt reaches the handler below
                logger.debug("unable to parse 'sitemap.xml' ('%s')" % getSafeExString(ex))
            finally:
                if found:
                    if items:
                        for item in items:
                            if re.search(r"(.*?)\?(.+)", item):
                                threadData.shared.value.add(item)
                        if conf.crawlDepth > 1:
                            threadData.shared.unprocessed.update(items)
                    logger.info("%s links found" % ("no" if not items else len(items)))

        if not conf.bulkFile:
            infoMsg = "starting crawler for target URL '%s'" % target
            logger.info(infoMsg)

        for i in xrange(conf.crawlDepth):
            threadData.shared.count = 0
            threadData.shared.length = len(threadData.shared.unprocessed)
            numThreads = min(conf.threads, len(threadData.shared.unprocessed))

            if not conf.bulkFile:
                logger.info("searching for links with depth %d" % (i + 1))

            runThreads(numThreads, crawlThread, threadChoice=(i > 0))
            clearConsoleLine(True)

            # Links found at this level become the work queue of the next one
            if threadData.shared.deeper:
                threadData.shared.unprocessed = set(threadData.shared.deeper)
            else:
                break

    except KeyboardInterrupt:
        warnMsg = "user aborted during crawling. sqlmap "
        warnMsg += "will use partial list"
        logger.warning(warnMsg)

    finally:
        clearConsoleLine(True)

        if not threadData.shared.value:
            if not (conf.forms and threadData.shared.formsFound):
                warnMsg = "no usable links found (with GET parameters)"
                if conf.forms:
                    warnMsg += " or forms"
                logger.warning(warnMsg)
        else:
            for url in threadData.shared.value:
                kb.targets.add((urldecode(url, kb.pageEncoding), None, None, None, None))

        if kb.targets:
            if kb.normalizeCrawlingChoice is None:
                message = "do you want to normalize "
                message += "crawling results [Y/n] "

                kb.normalizeCrawlingChoice = readInput(message, default='Y', boolean=True)

            if kb.normalizeCrawlingChoice:
                seen = set()
                results = OrderedSet()

                # Keep only the first target for each distinct combination of
                # last path segment and parameter names (values are blanked out)
                for entry in kb.targets:
                    value = "%s%s%s" % (entry[0], '&' if '?' in entry[0] else '?', entry[2] or "")
                    match = re.search(r"/[^/?]*\?.+\Z", value)
                    if match:
                        key = re.sub(r"=[^=&]*", "=", match.group(0)).strip("&?")
                        if '=' in key and key not in seen:
                            results.add(entry)
                            seen.add(key)

                kb.targets = results

            storeResultsToFile(kb.targets)
def storeResultsToFile(results):
    """
    Writes crawling results to a temporary file if the user agrees to

    The question is asked only while kb.storeCrawlingChoice is None; the
    answer is kept there afterwards. With conf.forms set, a CSV file
    (header "URL,POST") is written, otherwise a plain list of URLs (one
    per line).

    @param results: iterable of 5-item tuples where the first item is the
        URL and the third one is the POST data; nothing is done if empty
    @return: None
    """

    if not results:
        return

    if kb.storeCrawlingChoice is None:
        message = (
            "do you want to store crawling results to a temporary file "
            "for eventual further processing with other tools [y/N] "
        )
        kb.storeCrawlingChoice = readInput(message, default='N', boolean=True)

    if not kb.storeCrawlingChoice:
        return

    extension = ".csv" if conf.forms else ".txt"
    descriptor, path = tempfile.mkstemp(prefix=MKSTEMP_PREFIX.CRAWLER, suffix=extension)

    # Only the name is needed; the file is reopened through openFile() below
    os.close(descriptor)

    logger.info("writing crawling results to a temporary file '%s' " % path)

    with openFile(path, "w+b") as output:
        if conf.forms:
            output.write("URL,POST\n")

        for entry in results:
            link, _, body, _, _ = entry

            if conf.forms:
                record = "%s,%s\n" % (safeCSValue(link), safeCSValue(body or ""))
            else:
                record = "%s\n" % link

            output.write(record)