diff --git a/lib/core/settings.py b/lib/core/settings.py index c0f012911..c34e3bfb7 100755 --- a/lib/core/settings.py +++ b/lib/core/settings.py @@ -102,6 +102,9 @@ DUCKDUCKGO_REGEX = r'"u":"([^"]+)' # Regular expression used for extracting results from Disconnect Search DISCONNECT_SEARCH_REGEX = r'
([^<]+)
' +# Regular expression used for extracting results from Baidu Search +BAIDU_SEARCH_REGEX = r'' + # Dummy user agent for search (if default one returns different results) DUMMY_SEARCH_USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0" diff --git a/lib/utils/search.py b/lib/utils/search.py index 98dba4b50..03caa2cbd 100644 --- a/lib/utils/search.py +++ b/lib/utils/search.py @@ -30,12 +30,59 @@ from lib.core.settings import DUMMY_SEARCH_USER_AGENT from lib.core.settings import DUCKDUCKGO_REGEX from lib.core.settings import DISCONNECT_SEARCH_REGEX from lib.core.settings import GOOGLE_REGEX +from lib.core.settings import BAIDU_SEARCH_REGEX from lib.core.settings import HTTP_ACCEPT_ENCODING_HEADER_VALUE from lib.core.settings import UNICODE_ENCODING from lib.request.basic import decodePage from thirdparty.socks import socks +def _remove_duplicate(links): + if not links: + return [] + + # Dedup key is the URL without its query string. str.split("?")[0] keeps + # the whole URL when no "?" is present (url[:url.find("?")] would chop the + # last character because find() returns -1 for query-less URLs). + tmplinks = map(lambda url: url.split("?")[0], links) + tmplinks = set(tmplinks) + ret = [] + for link in links: + for tmplink in tmplinks: + if link.lower().find(tmplink.lower()) == 0: + ret.append(link) + tmplinks.remove(tmplink) + break + return ret + + +def _locate_real_url_from_baidu_results(links): + retVal = [] + + for link in links: + try: + req = urllib2.Request(link) + conn = urllib2.urlopen(req, timeout=conf.timeout) + page = conn.read() + responseHeaders = conn.info() + page = decodePage(page, responseHeaders.get("Content-Encoding"), responseHeaders.get("Content-Type")) + + if page: + url = conn.geturl() + if link != url: + logger.info(url) + retVal.append(url) + else: + # baidu sometimes will just use Javascript to redirect the page + # rather than responding a 302 HTTP code. + m = re.search('