sqlmap/lib/utils/crawler.py

#!/usr/bin/env python

"""
Copyright (c) 2006-2012 sqlmap developers (http://sqlmap.org/)
See the file 'doc/COPYING' for copying permission
"""

import httplib
import re
import urlparse
import time

from lib.core.common import clearConsoleLine
from lib.core.common import dataToStdout
from lib.core.common import findPageForms
from lib.core.common import singleTimeWarnMessage
from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
from lib.core.exception import SqlmapConnectionException
from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
from lib.request.connect import Connect as Request
from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
from thirdparty.oset.pyoset import oset

class Crawler(object):
    """
    This class defines methods used to perform crawling (command
    line option '--crawl'
    """

    def getTargetUrls(self):
        try:
            threadData = getCurrentThreadData()
            threadData.shared.outputs = oset()

            def crawlThread():
                threadData = getCurrentThreadData()

                while kb.threadContinue:
                    with kb.locks.limits:
                        if threadData.shared.unprocessed:
                            current = threadData.shared.unprocessed.pop()
                        else:
                            break

                    content = None
                    try:
                        if current:
                            content = Request.getPage(url=current, crawling=True, raise404=False)[0]
                    except SqlmapConnectionException, e:
                        errMsg = "connection exception detected (%s). skipping " % e
                        errMsg += "url '%s'" % current
                        logger.critical(errMsg)
                    except httplib.InvalidURL, e:
                        errMsg = "invalid url detected (%s). skipping " % e
                        errMsg += "url '%s'" % current
                        logger.critical(errMsg)

                    if not kb.threadContinue:
                        break

                    if isinstance(content, unicode):
                        try:
                            soup = BeautifulSoup(content)
                            for tag in soup('a'):
                                if tag.get("href"):
                                    url = urlparse.urljoin(conf.url, tag.get("href"))

                                    # flag to know if we are dealing with the same target host
                                    _ = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], (url, conf.url)))

                                    if conf.scope:
                                        if not re.search(conf.scope, url, re.I):
                                            continue
                                    elif not _:
                                        continue

                                    if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                                        with kb.locks.outputs:
                                            threadData.shared.deeper.add(url)
                                            if re.search(r"(.*?)\?(.+)", url):
                                                threadData.shared.outputs.add(url)
                        except UnicodeEncodeError: # for non-HTML files
                            pass
                        finally:
                            if conf.forms:
                                findPageForms(content, current, False, True)

                    if conf.verbose in (1, 2):
                        threadData.shared.count += 1
                        status = '%d/%d links visited (%d%s)' % (threadData.shared.count, threadData.shared.length, round(100.0*threadData.shared.count/threadData.shared.length), '%')
                        dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)

            threadData.shared.deeper = set()
            threadData.shared.unprocessed = set([conf.url])

            logger.info("starting crawler")

            for i in xrange(conf.crawlDepth):
                if i > 0 and conf.threads == 1:
                    singleTimeWarnMessage("running in a single-thread mode. This could take a while.")
                threadData.shared.count = 0
                threadData.shared.length = len(threadData.shared.unprocessed)
                numThreads = min(conf.threads, len(threadData.shared.unprocessed))
                logger.info("searching for links with depth %d" % (i + 1))
                runThreads(numThreads, crawlThread)
                clearConsoleLine(True)
                if threadData.shared.deeper:
                    threadData.shared.unprocessed = set(threadData.shared.deeper)
                else:
                    break

        except KeyboardInterrupt:
            warnMsg = "user aborted during crawling. sqlmap "
            warnMsg += "will use partial list"
            logger.warn(warnMsg)

        finally:
            clearConsoleLine(True)

            if not threadData.shared.outputs:
                warnMsg = "no usable links found (with GET parameters)"
                logger.warn(warnMsg)
            else:
                for url in threadData.shared.outputs:
                    kb.targets.add(( url, None, None, None ))
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`#!/usr/bin/env python`

			`"""`
modified homepage address 2012-07-12 21:38:03 +04:00			`Copyright (c) 2006-2012 sqlmap developers (http://sqlmap.org/)`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`See the file 'doc/COPYING' for copying permission`
			`"""`

quick fix for a bug reported by jovon.itwaru@gmail.com 2011-07-11 12:54:39 +04:00			`import httplib`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`import re`
			`import urlparse`
minor update 2011-06-20 18:27:24 +04:00			`import time`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
minor update 2011-06-20 18:27:24 +04:00			`from lib.core.common import clearConsoleLine`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`from lib.core.common import dataToStdout`
adding compatibility support for using --crawl and --forms together 2011-10-29 13:32:20 +04:00			`from lib.core.common import findPageForms`
minor update 2011-06-24 23:50:13 +04:00			`from lib.core.common import singleTimeWarnMessage`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`from lib.core.data import conf`
			`from lib.core.data import kb`
			`from lib.core.data import logger`
Doing some more style updating (capitalization of exception classes; using _ is enough for private members - __ is used in Python specific methods) 2012-12-06 17:14:19 +04:00			`from lib.core.exception import SqlmapConnectionException`
crawler fix (skip binary files) 2011-06-21 02:41:38 +04:00			`from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`from lib.core.threads import getCurrentThreadData`
			`from lib.core.threads import runThreads`
			`from lib.request.connect import Connect as Request`
More work for Issue #66 2012-07-14 19:01:04 +04:00			`from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup`
			`from thirdparty.oset.pyoset import oset`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
Introducing 'new style classes' (idea from Pull request #284) 2012-12-06 13:42:53 +04:00			`class Crawler(object):`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`"""`
			`This class defines methods used to perform crawling (command`
			`line option '--crawl'`
			`"""`

changing to: --crawl=CRAWLDEPTH 2011-06-24 09:40:03 +04:00			`def getTargetUrls(self):`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`try:`
			`threadData = getCurrentThreadData()`
			`threadData.shared.outputs = oset()`

			`def crawlThread():`
			`threadData = getCurrentThreadData()`

			`while kb.threadContinue:`
some more refactoring 2012-06-14 17:52:56 +04:00			`with kb.locks.limits:`
			`if threadData.shared.unprocessed:`
			`current = threadData.shared.unprocessed.pop()`
			`else:`
			`break`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
crawler fix (skip binary files) 2011-06-21 02:41:38 +04:00			`content = None`
minor update 2011-06-21 01:47:03 +04:00			`try:`
sorry Bernardo, i hope your mobile is turned off :))) 2011-06-21 02:47:24 +04:00			`if current:`
fix for a bug reported by g@brindi.si (UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 47: ordinal not in range(128)) 2011-06-24 23:24:11 +04:00			`content = Request.getPage(url=current, crawling=True, raise404=False)[0]`
Doing some more style updating (capitalization of exception classes; using _ is enough for private members - __ is used in Python specific methods) 2012-12-06 17:14:19 +04:00			`except SqlmapConnectionException, e:`
minor update 2011-06-21 01:47:03 +04:00			`errMsg = "connection exception detected (%s). skipping " % e`
			`errMsg += "url '%s'" % current`
			`logger.critical(errMsg)`
quick fix for a bug reported by jovon.itwaru@gmail.com 2011-07-11 12:54:39 +04:00			`except httplib.InvalidURL, e:`
			`errMsg = "invalid url detected (%s). skipping " % e`
			`errMsg += "url '%s'" % current`
			`logger.critical(errMsg)`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
			`if not kb.threadContinue:`
			`break`

crawler fix (skip binary files) 2011-06-21 02:41:38 +04:00			`if isinstance(content, unicode):`
minor bug fix 2011-12-20 14:34:28 +04:00			`try:`
			`soup = BeautifulSoup(content)`
			`for tag in soup('a'):`
			`if tag.get("href"):`
			`url = urlparse.urljoin(conf.url, tag.get("href"))`
removing of unused imports together with some general code refactoring 2012-02-22 14:40:11 +04:00
minor bug fix 2011-12-20 14:34:28 +04:00			`# flag to know if we are dealing with the same target host`
Implementation for an Issue #283 2012-12-06 14:57:57 +04:00			`_ = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], (url, conf.url)))`
removing of unused imports together with some general code refactoring 2012-02-22 14:40:11 +04:00
minor bug fix 2011-12-20 14:34:28 +04:00			`if conf.scope:`
			`if not re.search(conf.scope, url, re.I):`
			`continue`
Implementation for an Issue #283 2012-12-06 14:57:57 +04:00			`elif not _:`
minor fix for crawler and far less message overlaps in future 2011-06-21 01:18:12 +04:00			`continue`
removing of unused imports together with some general code refactoring 2012-02-22 14:40:11 +04:00
minor bug fix 2011-12-20 14:34:28 +04:00			`if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:`
some more refactoring 2012-06-14 17:52:56 +04:00			`with kb.locks.outputs:`
			`threadData.shared.deeper.add(url)`
			`if re.search(r"(.*?)\?(.+)", url):`
			`threadData.shared.outputs.add(url)`
minor bug fix 2011-12-20 14:34:28 +04:00			`except UnicodeEncodeError: # for non-HTML files`
			`pass`
			`finally:`
			`if conf.forms:`
			`findPageForms(content, current, False, True)`
adding compatibility support for using --crawl and --forms together 2011-10-29 13:32:20 +04:00
minor update 2011-06-20 18:27:24 +04:00			`if conf.verbose in (1, 2):`
			`threadData.shared.count += 1`
			`status = '%d/%d links visited (%d%s)' % (threadData.shared.count, threadData.shared.length, round(100.0*threadData.shared.count/threadData.shared.length), '%')`
			`dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)`

adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`threadData.shared.deeper = set()`
			`threadData.shared.unprocessed = set([conf.url])`

minor update 2011-06-20 15:46:23 +04:00			`logger.info("starting crawler")`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
changing to: --crawl=CRAWLDEPTH 2011-06-24 09:40:03 +04:00			`for i in xrange(conf.crawlDepth):`
minor update 2011-06-24 23:50:13 +04:00			`if i > 0 and conf.threads == 1:`
			`singleTimeWarnMessage("running in a single-thread mode. This could take a while.")`
minor update 2011-06-20 18:27:24 +04:00			`threadData.shared.count = 0`
			`threadData.shared.length = len(threadData.shared.unprocessed)`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`numThreads = min(conf.threads, len(threadData.shared.unprocessed))`
minor update 2011-06-20 18:27:24 +04:00			`logger.info("searching for links with depth %d" % (i + 1))`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`runThreads(numThreads, crawlThread)`
minor update 2011-06-20 18:27:24 +04:00			`clearConsoleLine(True)`
really minor update 2011-06-21 01:57:53 +04:00			`if threadData.shared.deeper:`
			`threadData.shared.unprocessed = set(threadData.shared.deeper)`
			`else:`
			`break`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
			`except KeyboardInterrupt:`
			`warnMsg = "user aborted during crawling. sqlmap "`
			`warnMsg += "will use partial list"`
			`logger.warn(warnMsg)`

			`finally:`
minor update 2011-06-20 18:27:24 +04:00			`clearConsoleLine(True)`

minor update 2011-06-20 17:53:39 +04:00			`if not threadData.shared.outputs:`
			`warnMsg = "no usable links found (with GET parameters)"`
			`logger.warn(warnMsg)`
			`else:`
			`for url in threadData.shared.outputs:`
Code refactoring (epecially Google search code) 2012-10-30 21:38:10 +04:00			`kb.targets.add(( url, None, None, None ))`