sqlmap/lib/utils/crawler.py

#!/usr/bin/env python

"""
Copyright (c) 2006-2013 sqlmap developers (http://sqlmap.org/)
See the file 'doc/COPYING' for copying permission
"""

import httplib
import re
import urlparse
import time

from lib.core.common import clearConsoleLine
from lib.core.common import dataToStdout
from lib.core.common import findPageForms
from lib.core.common import singleTimeWarnMessage
from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
from lib.core.exception import SqlmapConnectionException
from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
from lib.request.connect import Connect as Request
from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
from thirdparty.oset.pyoset import oset

def crawl(target):
    """
    Crawls the web site starting from the given target URL, following
    anchor links up to conf.crawlDepth levels

    Links are followed only when they match conf.scope (if set) or, when
    no scope is set, when they point to the same host as the target.
    Every followed link that carries a query string is collected and, at
    the end, stored into kb.targets as a (url, None, None, None) tuple.
    If conf.forms is set, each retrieved page is also passed to
    findPageForms().

    A KeyboardInterrupt stops the crawling; links collected up to that
    point are still stored into kb.targets.
    """

    # Bound before the 'try' so that the 'finally' clause can always
    # reference them
    threadData = getCurrentThreadData()
    threadData.shared.value = oset()

    # URLs already taken from the queue (shared by all crawling threads,
    # modified only while holding kb.locks.limit)
    visited = set()

    try:
        def crawlThread():
            threadData = getCurrentThreadData()

            while kb.threadContinue:
                with kb.locks.limit:
                    if threadData.shared.unprocessed:
                        current = threadData.shared.unprocessed.pop()
                        visited.add(current)
                    else:
                        break

                content = None
                try:
                    if current:
                        content = Request.getPage(url=current, crawling=True, raise404=False)[0]
                except SqlmapConnectionException as ex:
                    errMsg = "connection exception detected (%s). skipping " % ex
                    errMsg += "url '%s'" % current
                    logger.critical(errMsg)
                except httplib.InvalidURL as ex:
                    errMsg = "invalid url detected (%s). skipping " % ex
                    errMsg += "url '%s'" % current
                    logger.critical(errMsg)

                if not kb.threadContinue:
                    break

                if isinstance(content, unicode):
                    try:
                        # keep only the <html>...</html> part (if any)
                        match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
                        if match:
                            content = "<html>%s</html>" % match.group(1)

                        soup = BeautifulSoup(content)
                        tags = soup('a')

                        # fallback to regular expression matching when the
                        # parser finds no anchors
                        if not tags:
                            tags = re.finditer(r'(?si)<a[^>]+href="(?P<href>[^>"]+)"', content)

                        for tag in tags:
                            href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

                            if not href:
                                continue

                            # relative links have to be resolved against the
                            # page they have been found at (not the crawl root)
                            url = urlparse.urljoin(current, href)

                            if conf.scope:
                                if not re.search(conf.scope, url, re.I):
                                    continue
                            else:
                                # without the scope follow only links to the
                                # same host as the target (port is ignored)
                                urlHost = urlparse.urlparse(url).netloc.split(':')[0]
                                targetHost = urlparse.urlparse(target).netloc.split(':')[0]

                                if urlHost != targetHost:
                                    continue

                            # file extension is taken from the last segment of
                            # the URL path only, so that neither the host name
                            # nor the query string can be mistaken for it
                            filename = urlparse.urlparse(url).path.rsplit('/', 1)[-1]
                            extension = filename.rsplit('.', 1)[-1].lower() if '.' in filename else ""

                            if extension not in CRAWL_EXCLUDE_EXTENSIONS:
                                with kb.locks.value:
                                    threadData.shared.deeper.add(url)
                                    if re.search(r"(.*?)\?(.+)", url):
                                        threadData.shared.value.add(url)
                    except UnicodeEncodeError:  # for non-HTML files
                        pass
                    finally:
                        if conf.forms:
                            findPageForms(content, current, False, True)

                if conf.verbose in (1, 2):
                    threadData.shared.count += 1
                    status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
                    dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)

        threadData.shared.deeper = set()
        threadData.shared.unprocessed = set([target])

        logger.info("starting crawler")

        for i in xrange(conf.crawlDepth):
            if i > 0 and conf.threads == 1:
                singleTimeWarnMessage("running in a single-thread mode. This could take a while.")
            threadData.shared.count = 0
            threadData.shared.length = len(threadData.shared.unprocessed)
            numThreads = min(conf.threads, len(threadData.shared.unprocessed))
            logger.info("searching for links with depth %d" % (i + 1))
            runThreads(numThreads, crawlThread)
            clearConsoleLine(True)

            # next level consists only of links not dequeued before, so
            # that the same page is not requested again at every depth
            pending = threadData.shared.deeper - visited

            if pending:
                threadData.shared.unprocessed = pending
            else:
                break

    except KeyboardInterrupt:
        warnMsg = "user aborted during crawling. sqlmap "
        warnMsg += "will use partial list"
        logger.warn(warnMsg)

    finally:
        clearConsoleLine(True)

        if not threadData.shared.value:
            warnMsg = "no usable links found (with GET parameters)"
            logger.warn(warnMsg)
        else:
            for url in threadData.shared.value:
                kb.targets.add((url, None, None, None))
reverted a previous commit as not all distributions create a link file /usr/bin/python2 to the Python interpreter 2013-02-14 15:32:17 +04:00			`#!/usr/bin/env python`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
			`"""`
updated copyright 2013-01-18 18:07:51 +04:00			`Copyright (c) 2006-2013 sqlmap developers (http://sqlmap.org/)`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`See the file 'doc/COPYING' for copying permission`
			`"""`

quick fix for a bug reported by jovon.itwaru@gmail.com 2011-07-11 12:54:39 +04:00			`import httplib`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`import re`
			`import urlparse`
minor update 2011-06-20 18:27:24 +04:00			`import time`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
minor update 2011-06-20 18:27:24 +04:00			`from lib.core.common import clearConsoleLine`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`from lib.core.common import dataToStdout`
adding compatibility support for using --crawl and --forms together 2011-10-29 13:32:20 +04:00			`from lib.core.common import findPageForms`
minor update 2011-06-24 23:50:13 +04:00			`from lib.core.common import singleTimeWarnMessage`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`from lib.core.data import conf`
			`from lib.core.data import kb`
			`from lib.core.data import logger`
Doing some more style updating (capitalization of exception classes; using _ is enough for private members - __ is used in Python specific methods) 2012-12-06 17:14:19 +04:00			`from lib.core.exception import SqlmapConnectionException`
crawler fix (skip binary files) 2011-06-21 02:41:38 +04:00			`from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`from lib.core.threads import getCurrentThreadData`
			`from lib.core.threads import runThreads`
			`from lib.request.connect import Connect as Request`
More work for Issue #66 2012-07-14 19:01:04 +04:00			`from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup`
			`from thirdparty.oset.pyoset import oset`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`def crawl(target):`
			`try:`
			`threadData = getCurrentThreadData()`
			`threadData.shared.value = oset()`

			`def crawlThread():`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00			`threadData = getCurrentThreadData()`

Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`while kb.threadContinue:`
			`with kb.locks.limit:`
			`if threadData.shared.unprocessed:`
			`current = threadData.shared.unprocessed.pop()`
			`else:`
			`break`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`content = None`
			`try:`
			`if current:`
			`content = Request.getPage(url=current, crawling=True, raise404=False)[0]`
			`except SqlmapConnectionException, e:`
			`errMsg = "connection exception detected (%s). skipping " % e`
			`errMsg += "url '%s'" % current`
			`logger.critical(errMsg)`
			`except httplib.InvalidURL, e:`
			`errMsg = "invalid url detected (%s). skipping " % e`
			`errMsg += "url '%s'" % current`
			`logger.critical(errMsg)`

			`if not kb.threadContinue:`
			`break`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`if isinstance(content, unicode):`
minor update 2011-06-21 01:47:03 +04:00			`try:`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)`
			`if match:`
			`content = "<html>%s</html>" % match.group(1)`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`soup = BeautifulSoup(content)`
			`tags = soup('a')`
Fix for an Issue #324 (crawling when HTML is not well-formed) 2012-12-27 23:55:37 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`if not tags:`
			`tags = re.finditer(r'(?si)<a[^>]+href="(?P<href>[^>"]+)"', content)`
Fix for an Issue #324 (crawling when HTML is not well-formed) 2012-12-27 23:55:37 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`for tag in tags:`
			`href = tag.get("href") if hasattr(tag, "get") else tag.group("href")`
Fix for an Issue #324 (crawling when HTML is not well-formed) 2012-12-27 23:55:37 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`if href:`
			`url = urlparse.urljoin(target, href)`
Fix for an Issue #324 (crawling when HTML is not well-formed) 2012-12-27 23:55:37 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`# flag to know if we are dealing with the same target host`
			`_ = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], (url, target)))`
removing of unused imports together with some general code refactoring 2012-02-22 14:40:11 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`if conf.scope:`
			`if not re.search(conf.scope, url, re.I):`
minor fix for crawler and far less message overlaps in future 2011-06-21 01:18:12 +04:00			`continue`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`elif not _:`
			`continue`

			`if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:`
			`with kb.locks.value:`
			`threadData.shared.deeper.add(url)`
			`if re.search(r"(.*?)\?(.+)", url):`
			`threadData.shared.value.add(url)`
Some PEP8 related style cleaning 2013-01-10 16:18:44 +04:00			`except UnicodeEncodeError: # for non-HTML files`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`pass`
			`finally:`
			`if conf.forms:`
			`findPageForms(content, current, False, True)`

			`if conf.verbose in (1, 2):`
			`threadData.shared.count += 1`
Minor style update 2013-01-09 19:10:26 +04:00			`status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))`
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)`

			`threadData.shared.deeper = set()`
			`threadData.shared.unprocessed = set([target])`

			`logger.info("starting crawler")`

			`for i in xrange(conf.crawlDepth):`
			`if i > 0 and conf.threads == 1:`
			`singleTimeWarnMessage("running in a single-thread mode. This could take a while.")`
			`threadData.shared.count = 0`
			`threadData.shared.length = len(threadData.shared.unprocessed)`
			`numThreads = min(conf.threads, len(threadData.shared.unprocessed))`
			`logger.info("searching for links with depth %d" % (i + 1))`
			`runThreads(numThreads, crawlThread)`
			`clearConsoleLine(True)`
			`if threadData.shared.deeper:`
			`threadData.shared.unprocessed = set(threadData.shared.deeper)`
			`else:`
			`break`
removing of unused imports together with some general code refactoring 2012-02-22 14:40:11 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`except KeyboardInterrupt:`
			`warnMsg = "user aborted during crawling. sqlmap "`
			`warnMsg += "will use partial list"`
			`logger.warn(warnMsg)`
adding Beautifulsoup (BSD) into extras; adding --crawl to options 2011-06-20 15:32:30 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`finally:`
			`clearConsoleLine(True)`
minor update 2011-06-20 18:27:24 +04:00
Patch for an Issue #169 2013-01-09 18:22:21 +04:00			`if not threadData.shared.value:`
			`warnMsg = "no usable links found (with GET parameters)"`
			`logger.warn(warnMsg)`
			`else:`
			`for url in threadData.shared.value:`
			`kb.targets.add((url, None, None, None))`