Baidu search engine support

This commit is contained in:
5a43 2016-09-14 21:40:44 +08:00
parent 921a53e314
commit 83f0735c84
2 changed files with 110 additions and 45 deletions

View File

@ -94,6 +94,9 @@ DUCKDUCKGO_REGEX = r'"u":"([^"]+)'
# Regular expression used for extracting results from Disconnect Search
DISCONNECT_SEARCH_REGEX = r'<p class="url wrapword">([^<]+)</p>'
# Regular expression used for extracting results from Baidu Search
# Note: "([^"]+)" (rather than the greedy "(.+)?") keeps the captured href
# from running past its closing quote and swallowing neighbouring links
BAIDU_SEARCH_REGEX = r'<a\s+[\w\W]+?\s+href\s*=\s*"([^"]+)"\s+target\s*=\s*"_blank"\s*>'
# Dummy user agent for search (if default one returns different results)
DUMMY_SEARCH_USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:40.0) Gecko/20100101 Firefox/40.0"

View File

@ -30,12 +30,59 @@ from lib.core.settings import DUMMY_SEARCH_USER_AGENT
from lib.core.settings import DUCKDUCKGO_REGEX
from lib.core.settings import DISCONNECT_SEARCH_REGEX
from lib.core.settings import GOOGLE_REGEX
from lib.core.settings import BAIDU_SEARCH_REGEX
from lib.core.settings import HTTP_ACCEPT_ENCODING_HEADER_VALUE
from lib.core.settings import UNICODE_ENCODING
from lib.request.basic import decodePage
from thirdparty.socks import socks
def _remove_duplicate(links):
if not links:
return []
tmplinks = map(lambda url: url[:url.find("?")], links)
tmplinks = set(tmplinks)
ret = []
for link in links:
for tmplink in tmplinks:
if link.lower().find(tmplink.lower()) == 0:
ret.append(link)
tmplinks.remove(tmplink)
break
return ret
def _locate_real_url_from_baidu_results(links):
    """
    Resolves intermediate Baidu result links into their real target
    URLs by following each link (Baidu wraps results in redirection
    links rather than exposing targets directly)

    Note: performs one HTTP request per link
    """

    retVal = []

    for link in links:
        try:
            req = urllib2.Request(link)
            conn = urllib2.urlopen(req, timeout=conf.timeout)
            page = conn.read()
            responseHeaders = conn.info()
            page = decodePage(page, responseHeaders.get("Content-Encoding"), responseHeaders.get("Content-Type"))
            if page:
                url = conn.geturl()
                if link != url:
                    # Server-side (HTTP) redirection took place
                    logger.info(url)
                    retVal.append(url)
                else:
                    # Baidu sometimes uses a Javascript redirection
                    # rather than responding with a 302 HTTP code
                    match = re.search(r'<script>window\.location\.replace\("(?P<url>.+?)"\)', page, re.I)
                    if match:
                        url = match.group("url")
                        logger.info(url)
                        retVal.append(url)
        except Exception as ex:
            # Note: str(ex) rather than the deprecated ex.message
            # (not available on all exception classes)
            logger.debug(str(ex))

    # Remove duplicate links to make the scan more efficient, because
    # Baidu often has similar links in its results
    return _remove_duplicate(retVal)
def _search(dork):
"""
This method performs the effective search on Google providing
@ -46,20 +93,24 @@ def _search(dork):
return None
headers = {}
retVal = []
failed = False
headers[HTTP_HEADER.USER_AGENT] = dict(conf.httpHeaders).get(HTTP_HEADER.USER_AGENT, DUMMY_SEARCH_USER_AGENT)
headers[HTTP_HEADER.ACCEPT_ENCODING] = HTTP_ACCEPT_ENCODING_HEADER_VALUE
try:
req = urllib2.Request("https://www.google.com/ncr", headers=headers)
conn = urllib2.urlopen(req)
conn = urllib2.urlopen(req, timeout=conf.timeout)
except Exception, ex:
errMsg = "unable to connect to Google ('%s')" % getSafeExString(ex)
raise SqlmapConnectionException(errMsg)
logger.warn(errMsg)
failed = True
gpage = conf.googlePage if conf.googlePage > 1 else 1
logger.info("using search result page #%d" % gpage)
if not failed:
url = "https://www.google.com/search?"
url += "q=%s&" % urlencode(dork, convall=True)
url += "num=100&hl=en&complete=0&safe=off&filter=0&btnG=Search"
@ -110,7 +161,8 @@ def _search(dork):
message = "no usable links found. What do you want to do?"
message += "\n[1] (re)try with DuckDuckGo (default)"
message += "\n[2] (re)try with Disconnect Search"
message += "\n[3] quit"
message += "\n[3] (re)try with Baidu Search"
message += "\n[Q] quit"
choice = readInput(message, default="1").strip().upper()
if choice == "Q":
@ -123,6 +175,12 @@ def _search(dork):
url += "&nextDDG=%s" % urlencode("/search?q=%s&setmkt=en-US&setplang=en-us&setlang=en-us&first=%d&FORM=PORE" % (urlencode(dork, convall=True), (gpage - 1) * 10), convall=True)
url += "&sa=N&showIcons=false&filterIcons=none&js_enabled=1"
regex = DISCONNECT_SEARCH_REGEX
elif choice == "3":
url = "http://www.baidu.com"
url += "/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&rn=50"
url += "&wd=%s" % urlencode(dork, convall=True)
url += "&pn=%d" % ((gpage - 1) * 10)
regex = BAIDU_SEARCH_REGEX
else:
url = "https://duckduckgo.com/d.js?"
url += "q=%s&p=%d&s=100" % (urlencode(dork, convall=True), gpage)
@ -162,6 +220,10 @@ def _search(dork):
errMsg = "unable to connect"
raise SqlmapConnectionException(errMsg)
if choice == "3":
retVal = [urllib.unquote(match.group(1)) for match in re.finditer(regex, page, re.I)]
retVal = _locate_real_url_from_baidu_results(retVal)
else:
retVal = [urllib.unquote(match.group(1)) for match in re.finditer(regex, page, re.I | re.S)]
return retVal