Baidu search engine support

This commit is contained in:
5a43 2016-09-14 21:40:44 +08:00
parent 921a53e314
commit 83f0735c84
2 changed files with 110 additions and 45 deletions

View File

@ -94,6 +94,9 @@ DUCKDUCKGO_REGEX = r'"u":"([^"]+)'
# Regular expression used for extracting results from Disconnect Search
DISCONNECT_SEARCH_REGEX = r'<p class="url wrapword">([^<]+)</p>'
# Regular expression used for extracting results from Baidu Search
# Note: "([^"]+)" (rather than the greedy "(.+)?") keeps the captured href
# from running past its closing quote and swallowing neighbouring links
BAIDU_SEARCH_REGEX = r'<a\s+[\w\W]+?\s+href\s*=\s*"([^"]+)"\s+target\s*=\s*"_blank"\s*>'
# Dummy user agent for search (if default one returns different results)
DUMMY_SEARCH_USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:40.0) Gecko/20100101 Firefox/40.0"

View File

@ -30,12 +30,59 @@ from lib.core.settings import DUMMY_SEARCH_USER_AGENT
from lib.core.settings import DUCKDUCKGO_REGEX
from lib.core.settings import DISCONNECT_SEARCH_REGEX
from lib.core.settings import GOOGLE_REGEX
from lib.core.settings import BAIDU_SEARCH_REGEX
from lib.core.settings import HTTP_ACCEPT_ENCODING_HEADER_VALUE
from lib.core.settings import UNICODE_ENCODING
from lib.request.basic import decodePage
from thirdparty.socks import socks
def _remove_duplicate(links):
if not links:
return []
tmplinks = map(lambda url: url[:url.find("?")], links)
tmplinks = set(tmplinks)
ret = []
for link in links:
for tmplink in tmplinks:
if link.lower().find(tmplink.lower()) == 0:
ret.append(link)
tmplinks.remove(tmplink)
break
return ret
def _locate_real_url_from_baidu_results(links):
    """
    Resolves intermediate Baidu result links into their real target
    URLs by following each link (Baidu wraps results in redirection
    links rather than exposing targets directly)

    Note: performs one HTTP request per link
    """

    retVal = []

    for link in links:
        try:
            req = urllib2.Request(link)
            conn = urllib2.urlopen(req, timeout=conf.timeout)
            page = conn.read()
            responseHeaders = conn.info()
            page = decodePage(page, responseHeaders.get("Content-Encoding"), responseHeaders.get("Content-Type"))
            if page:
                url = conn.geturl()
                if link != url:
                    # Server-side (HTTP) redirection took place
                    logger.info(url)
                    retVal.append(url)
                else:
                    # Baidu sometimes uses a Javascript redirection
                    # rather than responding with a 302 HTTP code
                    match = re.search(r'<script>window\.location\.replace\("(?P<url>.+?)"\)', page, re.I)
                    if match:
                        url = match.group("url")
                        logger.info(url)
                        retVal.append(url)
        except Exception as ex:
            # Note: str(ex) rather than the deprecated ex.message
            # (not available on all exception classes)
            logger.debug(str(ex))

    # Remove duplicate links to make the scan more efficient, because
    # Baidu often has similar links in its results
    return _remove_duplicate(retVal)
def _search(dork):
"""
This method performs the effective search on Google providing
@ -46,20 +93,24 @@ def _search(dork):
return None
headers = {}
retVal = []
failed = False
headers[HTTP_HEADER.USER_AGENT] = dict(conf.httpHeaders).get(HTTP_HEADER.USER_AGENT, DUMMY_SEARCH_USER_AGENT)
headers[HTTP_HEADER.ACCEPT_ENCODING] = HTTP_ACCEPT_ENCODING_HEADER_VALUE
try:
req = urllib2.Request("https://www.google.com/ncr", headers=headers)
conn = urllib2.urlopen(req)
conn = urllib2.urlopen(req, timeout=conf.timeout)
except Exception, ex:
errMsg = "unable to connect to Google ('%s')" % getSafeExString(ex)
raise SqlmapConnectionException(errMsg)
logger.warn(errMsg)
failed = True
gpage = conf.googlePage if conf.googlePage > 1 else 1
logger.info("using search result page #%d" % gpage)
if not failed:
url = "https://www.google.com/search?"
url += "q=%s&" % urlencode(dork, convall=True)
url += "num=100&hl=en&complete=0&safe=off&filter=0&btnG=Search"
@ -110,7 +161,8 @@ def _search(dork):
message = "no usable links found. What do you want to do?"
message += "\n[1] (re)try with DuckDuckGo (default)"
message += "\n[2] (re)try with Disconnect Search"
message += "\n[3] quit"
message += "\n[3] (re)try with Baidu Search"
message += "\n[Q] quit"
choice = readInput(message, default="1").strip().upper()
if choice == "Q":
@ -123,6 +175,12 @@ def _search(dork):
url += "&nextDDG=%s" % urlencode("/search?q=%s&setmkt=en-US&setplang=en-us&setlang=en-us&first=%d&FORM=PORE" % (urlencode(dork, convall=True), (gpage - 1) * 10), convall=True)
url += "&sa=N&showIcons=false&filterIcons=none&js_enabled=1"
regex = DISCONNECT_SEARCH_REGEX
elif choice == "3":
url = "http://www.baidu.com"
url += "/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&rn=50"
url += "&wd=%s" % urlencode(dork, convall=True)
url += "&pn=%d" % ((gpage - 1) * 10)
regex = BAIDU_SEARCH_REGEX
else:
url = "https://duckduckgo.com/d.js?"
url += "q=%s&p=%d&s=100" % (urlencode(dork, convall=True), gpage)
@ -162,6 +220,10 @@ def _search(dork):
errMsg = "unable to connect"
raise SqlmapConnectionException(errMsg)
if choice == "3":
retVal = [urllib.unquote(match.group(1)) for match in re.finditer(regex, page, re.I)]
retVal = _locate_real_url_from_baidu_results(retVal)
else:
retVal = [urllib.unquote(match.group(1)) for match in re.finditer(regex, page, re.I | re.S)]
return retVal