From 83f0735c845a4049cb5af32695ee50a4bb60ce04 Mon Sep 17 00:00:00 2001 From: 5a43 Date: Wed, 14 Sep 2016 21:40:44 +0800 Subject: [PATCH] baidu search engine support --- lib/core/settings.py | 3 + lib/utils/search.py | 152 ++++++++++++++++++++++++++++++------------- 2 files changed, 110 insertions(+), 45 deletions(-) diff --git a/lib/core/settings.py b/lib/core/settings.py index fdd451e2f..59762690b 100644 --- a/lib/core/settings.py +++ b/lib/core/settings.py @@ -94,6 +94,9 @@ DUCKDUCKGO_REGEX = r'"u":"([^"]+)' # Regular expression used for extracting results from Disconnect Search DISCONNECT_SEARCH_REGEX = r'

([^<]+)

' +# Regular expression used for extracting results from Baidu Search +BAIDU_SEARCH_REGEX = r'' + # Dummy user agent for search (if default one returns different results) DUMMY_SEARCH_USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:40.0) Gecko/20100101 Firefox/40.0" diff --git a/lib/utils/search.py b/lib/utils/search.py index 1f54deac1..6b51c38e7 100644 --- a/lib/utils/search.py +++ b/lib/utils/search.py @@ -30,12 +30,59 @@ from lib.core.settings import DUMMY_SEARCH_USER_AGENT from lib.core.settings import DUCKDUCKGO_REGEX from lib.core.settings import DISCONNECT_SEARCH_REGEX from lib.core.settings import GOOGLE_REGEX +from lib.core.settings import BAIDU_SEARCH_REGEX from lib.core.settings import HTTP_ACCEPT_ENCODING_HEADER_VALUE from lib.core.settings import UNICODE_ENCODING from lib.request.basic import decodePage from thirdparty.socks import socks +def _remove_duplicate(links): + if not links: + return [] + + tmplinks = map(lambda url: url[:url.find("?")], links) + tmplinks = set(tmplinks) + ret = [] + for link in links: + for tmplink in tmplinks: + if link.lower().find(tmplink.lower()) == 0: + ret.append(link) + tmplinks.remove(tmplink) + break + return ret + + +def _locate_real_url_from_baidu_results(links): + retVal = [] + + for link in links: + try: + req = urllib2.Request(link) + conn = urllib2.urlopen(req, timeout=conf.timeout) + page = conn.read() + responseHeaders = conn.info() + page = decodePage(page, responseHeaders.get("Content-Encoding"), responseHeaders.get("Content-Type")) + + if page: + url = conn.geturl() + if link != url: + logger.info(url) + retVal.append(url) + else: + # baidu sometimes will just use Javascript to redirect the page + # rather than responding a 302 HTTP code. + m = re.search('