sqlmap/lib/utils/crawler.py

137 lines
5.6 KiB
Python
Raw Normal View History

#!/usr/bin/env python
"""
$Id$
2011-07-08 00:10:03 +04:00
Copyright (c) 2006-2011 sqlmap developers (http://www.sqlmap.org/)
See the file 'doc/COPYING' for copying permission
"""
import httplib
import re
import threading
import urlparse
2011-06-20 18:27:24 +04:00
import time
2011-06-20 18:27:24 +04:00
from lib.core.common import clearConsoleLine
from lib.core.common import dataToStdout
from lib.core.common import findPageForms
2011-06-24 23:50:13 +04:00
from lib.core.common import singleTimeWarnMessage
from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
from lib.core.exception import sqlmapConnectionException
2011-06-21 02:41:38 +04:00
from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
from lib.request.connect import Connect as Request
from extra.beautifulsoup.beautifulsoup import BeautifulSoup
from extra.oset.pyoset import oset
class Crawler:
    """
    This class defines methods used to perform crawling (command
    line option '--crawl')
    """

    def _getHost(self, url):
        """
        Returns the host part (port stripped) of the given url
        """

        return urlparse.urlparse(url).netloc.split(':')[0]

    def _isInScope(self, url):
        """
        Tells whether the given url is allowed to be followed: when
        conf.scope is set the url has to match it (case insensitive),
        otherwise the url has to share the host with conf.url
        """

        if conf.scope:
            return re.search(conf.scope, url, re.I) is not None

        return self._getHost(url) == self._getHost(conf.url)

    def _hasExcludedExtension(self, url):
        """
        Tells whether the file extension found in the path of the given
        url is listed in CRAWL_EXCLUDE_EXTENSIONS
        """

        # only the last path segment is inspected so that dots located
        # in the host name or in the query string are not mistaken for
        # a file extension
        filename = urlparse.urlparse(url).path.split('/')[-1]

        if '.' not in filename:
            return False

        return filename.split('.')[-1].lower() in CRAWL_EXCLUDE_EXTENSIONS

    def getTargetUrls(self):
        """
        Crawls the target starting from conf.url, for at most
        conf.crawlDepth rounds, and adds every collected url carrying a
        query string to kb.targetUrls. When conf.forms is set each
        fetched page is also handed over to findPageForms()
        """

        # retrieved before entering the 'try' block because the
        # 'finally' clause needs it as well
        threadData = getCurrentThreadData()
        threadData.shared.outputs = oset()

        try:
            def crawlThread():
                threadData = getCurrentThreadData()

                while kb.threadContinue:
                    # take the next url to process; the lock is released
                    # no matter how this section is left
                    kb.locks.limits.acquire()
                    try:
                        if not threadData.shared.unprocessed:
                            break
                        current = threadData.shared.unprocessed.pop()
                    finally:
                        kb.locks.limits.release()

                    content = None

                    try:
                        if current:
                            content = Request.getPage(url=current, crawling=True, raise404=False)[0]
                    except sqlmapConnectionException as e:
                        errMsg = "connection exception detected (%s). skipping " % e
                        errMsg += "url '%s'" % current
                        logger.critical(errMsg)
                    except httplib.InvalidURL as e:
                        errMsg = "invalid url detected (%s). skipping " % e
                        errMsg += "url '%s'" % current
                        logger.critical(errMsg)

                    if not kb.threadContinue:
                        break

                    if isinstance(content, unicode):
                        try:
                            soup = BeautifulSoup(content)

                            for tag in soup('a'):
                                href = tag.get("href")

                                if not href:
                                    continue

                                # relative links have to be resolved against
                                # the page they were found on
                                url = urlparse.urljoin(current, href)

                                if not self._isInScope(url):
                                    continue

                                if self._hasExcludedExtension(url):
                                    continue

                                kb.locks.outputs.acquire()
                                try:
                                    threadData.shared.deeper.add(url)

                                    # only urls with a query string are
                                    # reported as usable targets
                                    if re.search(r"(.*?)\?(.+)", url):
                                        threadData.shared.outputs.add(url)
                                finally:
                                    kb.locks.outputs.release()
                        except UnicodeEncodeError: # for non-HTML files
                            pass
                        finally:
                            if conf.forms:
                                findPageForms(content, current, False, True)

                    if conf.verbose in (1, 2):
                        # NOTE(review): the counter is updated without holding
                        # a lock; inside this class it only feeds the progress line
                        threadData.shared.count += 1
                        status = '%d/%d links visited (%d%s)' % (threadData.shared.count, threadData.shared.length, round(100.0*threadData.shared.count/threadData.shared.length), '%')
                        dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)

            threadData.shared.deeper = set()
            threadData.shared.unprocessed = set([conf.url])

            # urls already scheduled in a previous round (touched only by
            # this thread, in between the runThreads() calls)
            visited = set()

            logger.info("starting crawler")

            for i in xrange(conf.crawlDepth):
                if i > 0 and conf.threads == 1:
                    singleTimeWarnMessage("running in a single-thread mode. This could take a while.")

                threadData.shared.count = 0
                threadData.shared.length = len(threadData.shared.unprocessed)
                numThreads = min(conf.threads, len(threadData.shared.unprocessed))

                # remembered now because the workers empty the set
                visited.update(threadData.shared.unprocessed)

                logger.info("searching for links with depth %d" % (i + 1))
                runThreads(numThreads, crawlThread)
                clearConsoleLine(True)

                # go deeper only with links that were not requested yet
                pending = threadData.shared.deeper - visited

                if pending:
                    threadData.shared.unprocessed = pending
                else:
                    break

        except KeyboardInterrupt:
            warnMsg = "user aborted during crawling. sqlmap "
            warnMsg += "will use partial list"
            logger.warn(warnMsg)

        finally:
            clearConsoleLine(True)

            if not threadData.shared.outputs:
                warnMsg = "no usable links found (with GET parameters)"
                logger.warn(warnMsg)
            else:
                for url in threadData.shared.outputs:
                    kb.targetUrls.add(( url, None, None, None ))

            # NOTE(review): reset regardless of the crawling outcome;
            # confirm the intended nesting of this statement
            kb.suppressResumeInfo = False