crawler fix (skip binary files)

This commit is contained in:
Miroslav Stampar 2011-06-20 22:41:38 +00:00
parent 20bb1a685b
commit 2a4a284a29
3 changed files with 8 additions and 6 deletions

View File

@@ -549,9 +549,6 @@ class Tag(PageElement):
val))
self.attrs = map(convert, self.attrs)
# Reference: http://bytes.com/topic/python/answers/552874-py-2-5-bug-sgmllib
SGMLParser.convert_codepoint = lambda self, codepoint: unichr(codepoint)
def getString(self):
if (len(self.contents) == 1
and isinstance(self.contents[0], NavigableString)):

View File

@@ -354,3 +354,6 @@ HTML_TITLE_REGEX = "<title>(?P<result>[^<]+)</title>"
# Chars used to quickly distinguish if the user provided tainted parameter values
DUMMY_SQL_INJECTION_CHARS = ";()\"'"
# Extensions skipped by crawler
CRAWL_EXCLUDE_EXTENSIONS = ("gif","jpg","jar","tif","bmp","war","ear","mpg","wmv","mpeg","scm","iso","dmp","dll","cab","so","avi","bin","exe","iso","tar","png","pdf","ps","mp3","zip","rar","gz")

View File

@@ -18,6 +18,7 @@ from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
from lib.core.exception import sqlmapConnectionException
from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
from lib.request.connect import Connect as Request
@@ -51,10 +52,11 @@ class Crawler:
kb.locks.limits.release()
break
content = None
try:
if current.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
content = Request.getPage(url=current, raise404=False)[0]
except sqlmapConnectionException, e:
content = None
errMsg = "connection exception detected (%s). skipping " % e
errMsg += "url '%s'" % current
logger.critical(errMsg)
@@ -62,7 +64,7 @@ class Crawler:
if not kb.threadContinue:
break
if content: if isinstance(content, unicode):
soup = BeautifulSoup(content)
for tag in soup('a'):
if tag.get("href"):