diff --git a/extra/beautifulsoup/beautifulsoup.py b/extra/beautifulsoup/beautifulsoup.py index d43d3a71d..4b17b853d 100644 --- a/extra/beautifulsoup/beautifulsoup.py +++ b/extra/beautifulsoup/beautifulsoup.py @@ -549,9 +549,6 @@ class Tag(PageElement): val)) self.attrs = map(convert, self.attrs) - # Reference: http://bytes.com/topic/python/answers/552874-py-2-5-bug-sgmllib - SGMLParser.convert_codepoint = lambda self, codepoint: unichr(codepoint) - def getString(self): if (len(self.contents) == 1 and isinstance(self.contents[0], NavigableString)): diff --git a/lib/core/settings.py b/lib/core/settings.py index abdc26d46..3064d91ec 100644 --- a/lib/core/settings.py +++ b/lib/core/settings.py @@ -354,3 +354,6 @@ HTML_TITLE_REGEX = "(?P<result>[^<]+)" # Chars used to quickly distinguish if the user provided tainted parameter values DUMMY_SQL_INJECTION_CHARS = ";()\"'" + +# Extensions skipped by crawler +CRAWL_EXCLUDE_EXTENSIONS = ("gif","jpg","jar","tif","bmp","war","ear","mpg","wmv","mpeg","scm","iso","dmp","dll","cab","so","avi","bin","exe","iso","tar","png","pdf","ps","mp3","zip","rar","gz") diff --git a/lib/utils/crawler.py b/lib/utils/crawler.py index 383688219..169f4035d 100644 --- a/lib/utils/crawler.py +++ b/lib/utils/crawler.py @@ -18,6 +18,7 @@ from lib.core.data import conf from lib.core.data import kb from lib.core.data import logger from lib.core.exception import sqlmapConnectionException +from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS from lib.core.threads import getCurrentThreadData from lib.core.threads import runThreads from lib.request.connect import Connect as Request @@ -51,10 +52,11 @@ class Crawler: kb.locks.limits.release() break + content = None try: - content = Request.getPage(url=current, raise404=False)[0] + if current.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS: + content = Request.getPage(url=current, raise404=False)[0] except sqlmapConnectionException, e: - content = None errMsg = "connection exception detected (%s). skipping " % e errMsg += "url '%s'" % current logger.critical(errMsg) @@ -62,7 +64,7 @@ class Crawler: if not kb.threadContinue: break - if content: + if isinstance(content, unicode): soup = BeautifulSoup(content) for tag in soup('a'): if tag.get("href"):