sqlmap/lib/utils/crawler.py

150 lines
5.9 KiB
Python
Raw Normal View History

#!/usr/bin/env python
"""
2014-01-13 21:24:49 +04:00
Copyright (c) 2006-2014 sqlmap developers (http://sqlmap.org/)
See the file 'doc/COPYING' for copying permission
"""
import httplib
import re
import urlparse
2011-06-20 18:27:24 +04:00
import time
2011-06-20 18:27:24 +04:00
from lib.core.common import clearConsoleLine
from lib.core.common import dataToStdout
from lib.core.common import findPageForms
2011-06-24 23:50:13 +04:00
from lib.core.common import singleTimeWarnMessage
from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
from lib.core.exception import SqlmapConnectionException
2011-06-21 02:41:38 +04:00
from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
from lib.request.connect import Connect as Request
2012-07-14 19:01:04 +04:00
from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
from thirdparty.oset.pyoset import oset
2013-01-09 18:22:21 +04:00
def crawl(target):
    """
    Crawls the given target URL for links, going at most conf.crawlDepth
    levels deep, and registers every collected URL that carries a query
    string (i.e. GET parameters) into kb.targets

    Links are followed only if they match conf.scope (when set) or, otherwise,
    if they point to the same host as the target. When conf.forms is set each
    parsed page is also handed over to findPageForms()

    A KeyboardInterrupt during crawling is caught, in which case the links
    collected up to that point are used
    """

    try:
        # URLs that have already been taken for processing (shared by all
        # worker threads through the closure; accessed under kb.locks.limit)
        visited = set()
        threadData = getCurrentThreadData()
        threadData.shared.value = oset()

        def crawlThread():
            # Worker body: pops URLs from the 'unprocessed' pool until it is
            # drained or the run is aborted
            threadData = getCurrentThreadData()

            while kb.threadContinue:
                with kb.locks.limit:
                    if threadData.shared.unprocessed:
                        current = threadData.shared.unprocessed.pop()

                        if current in visited:
                            continue
                        else:
                            visited.add(current)
                    else:
                        break

                content = None

                try:
                    if current:
                        # NOTE(review): first item of the getPage() result is
                        # presumably the page body - confirm against Connect
                        content = Request.getPage(url=current, crawling=True, raise404=False)[0]
                except SqlmapConnectionException as ex:
                    errMsg = "connection exception detected (%s). skipping " % ex
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)
                except httplib.InvalidURL as ex:
                    errMsg = "invalid URL detected (%s). skipping " % ex
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)

                # the run could have been aborted while the page was retrieved
                if not kb.threadContinue:
                    break

                if isinstance(content, unicode):
                    try:
                        # keep only the part enclosed in <html>...</html>
                        match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
                        if match:
                            content = "<html>%s</html>" % match.group(1)

                        soup = BeautifulSoup(content)
                        tags = soup('a')

                        # fall back to plain regular expression if the parser
                        # has found no anchors
                        if not tags:
                            tags = re.finditer(r'(?si)<a[^>]+href="(?P<href>[^>"]+)"', content)

                        for tag in tags:
                            # parser tags provide get(), regex matches group()
                            href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

                            if href:
                                # NOTE(review): lastRedirectURL looks like a
                                # (request UID, URL) pair; if it belongs to the
                                # last request, links are resolved against the
                                # redirection URL - confirm against Connect
                                if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:
                                    current = threadData.lastRedirectURL[1]

                                url = urlparse.urljoin(current, href)
                                parsed = urlparse.urlparse(url)

                                # flag to know if we are dealing with the same target host
                                # (host names are compared without the port part)
                                sameHost = parsed.netloc.split(':')[0] == urlparse.urlparse(target).netloc.split(':')[0]

                                if conf.scope:
                                    if not re.search(conf.scope, url, re.I):
                                        continue
                                elif not sameHost:
                                    continue

                                # extension is taken from the path component only, so
                                # that a query string or path parameters can not hide
                                # an excluded one (e.g. 'photo.jpg?size=1')
                                if parsed.path.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                                    with kb.locks.value:
                                        threadData.shared.deeper.add(url)

                                        # only URLs with a query string are usable as targets
                                        if re.search(r"(.*?)\?(.+)", url):
                                            threadData.shared.value.add(url)
                    except UnicodeEncodeError:  # for non-HTML files
                        pass
                    finally:
                        # runs even if the link extraction above has failed
                        if conf.forms:
                            findPageForms(content, current, False, True)

                if conf.verbose in (1, 2):
                    threadData.shared.count += 1
                    status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
                    dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)

        threadData.shared.deeper = set()
        threadData.shared.unprocessed = set([target])

        infoMsg = "starting crawler"
        if conf.bulkFile:
            infoMsg += " for target URL '%s'" % target
        logger.info(infoMsg)

        for i in xrange(conf.crawlDepth):
            if i > 0 and conf.threads == 1:
                singleTimeWarnMessage("running in a single-thread mode. This could take a while")

            threadData.shared.count = 0
            threadData.shared.length = len(threadData.shared.unprocessed)
            numThreads = min(conf.threads, len(threadData.shared.unprocessed))

            if not conf.bulkFile:
                logger.info("searching for links with depth %d" % (i + 1))

            runThreads(numThreads, crawlThread)
            clearConsoleLine(True)

            # links collected so far become the pool for the next depth level
            # ('deeper' is not reset; repeats are skipped through 'visited')
            if threadData.shared.deeper:
                threadData.shared.unprocessed = set(threadData.shared.deeper)
            else:
                break

    except KeyboardInterrupt:
        warnMsg = "user aborted during crawling. sqlmap "
        warnMsg += "will use partial list"
        logger.warn(warnMsg)

    finally:
        clearConsoleLine(True)

        if not threadData.shared.value:
            warnMsg = "no usable links found (with GET parameters)"
            logger.warn(warnMsg)
        else:
            for url in threadData.shared.value:
                kb.targets.add((url, None, None, None))