Mirror of https://github.com/sqlmapproject/sqlmap.git (synced 2024-11-22 09:36:35 +03:00)

Minor refactoring and update of sqlharvest.py

parent 1bcf5a6b88
commit 80120e849f
@@ -15,66 +15,62 @@ import ConfigParser
 
 from operator import itemgetter
 
+TIMEOUT = 10
+CONFIG_FILE = 'sqlharvest.cfg'
+TABLES_FILE = 'tables.txt'
+USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; AskTB5.3)'
+SEARCH_URL = 'http://www.google.com/m?source=mobileproducts&dc=gorganic'
+MAX_FILE_SIZE = 2 * 1024 * 1024 # if a result (.sql) file for downloading is more than 2MB in size just skip it
+QUERY = 'CREATE TABLE ext:sql'
+REGEX_URLS = r';u=([^"]+?)&q='
+REGEX_RESULT = r'(?i)CREATE TABLE\s*(/\*.*\*/)?\s*(IF NOT EXISTS)?\s*(?P<result>[^\(;]+)'
+
 def main():
-    TIMEOUT = 10
-    CONFIG_FILE = 'sqlharvest.cfg'
-    TABLES_FILE = 'tables.txt'
-    USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; AskTB5.3)'
-    SEARCH_URL = 'http://www.google.com/m?source=mobileproducts&dc=gorganic'
-    MAX_FILE_SIZE = 2*1024*1024 # if a result (.sql) file for downloading is more than 2MB in size just skip it
-    QUERY = 'CREATE TABLE ext:sql'
-    REGEX_URLS = r';u=([^"]+)'
-    REGEX_RESULT = r'CREATE TABLE\s*(/\*.*\*/)?\s*(IF NOT EXISTS)?\s*(?P<result>[^\(;]+)'
-
     tables = dict()
-    refiles = re.compile(REGEX_URLS)
-    retables = re.compile(REGEX_RESULT, re.I)
 
     cookies = cookielib.CookieJar()
     cookie_processor = urllib2.HTTPCookieProcessor(cookies)
     opener = urllib2.build_opener(cookie_processor)
-    opener.addheaders = [('User-Agent', USER_AGENT)]
+    opener.addheaders = [("User-Agent", USER_AGENT)]
 
     conn = opener.open(SEARCH_URL)
     page = conn.read() # set initial cookie values
 
     config = ConfigParser.ConfigParser()
     config.read(CONFIG_FILE)
 
-    if not config.has_section('options'):
-        config.add_section('options')
-
-    if not config.has_option('options', 'index'):
-        config.set('options', 'index', '0')
+    if not config.has_section("options"):
+        config.add_section("options")
+    if not config.has_option("options", "index"):
+        config.set("options", "index", "0")
 
-    i = int(config.get('options', 'index'))
+    i = int(config.get("options", "index"))
 
     try:
-        f = open(TABLES_FILE, 'r')
-        for line in f.xreadlines():
-            if len(line) > 0 and ',' in line:
-                temp = line.split(',')
-                tables[temp[0]] = int(temp[1])
-        f.close()
+        with open(TABLES_FILE, 'r') as f:
+            for line in f.xreadlines():
+                if len(line) > 0 and ',' in line:
+                    temp = line.split(',')
+                    tables[temp[0]] = int(temp[1])
    except:
        pass
 
     socket.setdefaulttimeout(TIMEOUT)
 
-    files, oldFiles = None, None
+    files, old_files = None, None
 
     try:
         while True:
             abort = False
-            oldFiles = files
+            old_files = files
             files = []
 
             try:
-                conn = opener.open('%s&q=%s&start=%d&sa=N' % (SEARCH_URL, QUERY.replace(' ', '+'), i*10))
+                conn = opener.open("%s&q=%s&start=%d&sa=N" % (SEARCH_URL, QUERY.replace(' ', '+'), i * 10))
                 page = conn.read()
 
-                for match in refiles.finditer(page):
+                for match in re.finditer(REGEX_URLS, page):
                     files.append(urllib.unquote(match.group(1)))
-                    if len(files) >= 10: break
-                abort = (files == oldFiles)
+
+                    if len(files) >= 10:
+                        break
+
+                abort = (files == old_files)
 
             except KeyboardInterrupt:
                 raise
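The regex changes in this hunk are easy to miss in the quote-style churn: REGEX_URLS now stops non-greedily at the &q= delimiter instead of matching greedily up to the next quote, and REGEX_RESULT carries the inline (?i) flag, which is what lets the precompiled refiles/retables objects go away in favor of re.finditer() on the raw pattern. A minimal sketch of the difference, using a made-up fragment of a results page:

import re

# made-up fragment of a Google mobile results page, for illustration only
page = ';u=http%3A%2F%2Fexample.com%2Fdump.sql&q=CREATE+TABLE"'

old = re.search(r';u=([^"]+)', page)      # greedy: swallows the &q=... tail
new = re.search(r';u=([^"]+?)&q=', page)  # non-greedy: stops at the delimiter

print(old.group(1))  # http%3A%2F%2Fexample.com%2Fdump.sql&q=CREATE+TABLE
print(new.group(1))  # http%3A%2F%2Fexample.com%2Fdump.sql

With the (?i) flag embedded in REGEX_RESULT, re.finditer(REGEX_RESULT, page) is case-insensitive without a separate re.compile(..., re.I) step.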
@@ -91,23 +87,24 @@ def main():
 
             for sqlfile in files:
                 print sqlfile
 
                 try:
                     req = urllib2.Request(sqlfile)
                     response = urllib2.urlopen(req)
 
-                    if response.headers.has_key('Content-Length'):
-                        if int(response.headers.get('Content-Length')) > MAX_FILE_SIZE:
+                    if response.headers.has_key("Content-Length"):
+                        if int(response.headers.get("Content-Length")) > MAX_FILE_SIZE:
                             continue
 
                     page = response.read()
                     found = False
                     counter = 0
 
-                    for match in retables.finditer(page):
+                    for match in re.finditer(REGEX_RESULT, page):
                         counter += 1
-                        table = match.group("result").strip().strip("`").strip("\"").strip("'").replace('"."', ".").replace("].[", ".").strip('[').strip(']')
+                        table = match.group("result").strip().strip("`\"'").replace('"."', ".").replace("].[", ".").strip('[]')
 
-                        if table and '>' not in table and '<' not in table and '--' not in table and ' ' not in table:
+                        if table and not any(_ in table for _ in ('>', '<', '--', ' ')):
                             found = True
                             sys.stdout.write('*')
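The sanitization rewrite works because str.strip() treats its argument as a set of characters, not a substring, so strip("`").strip("\"").strip("'") collapses into a single strip("`\"'") and strip('[').strip(']') into strip('[]'). A short sketch of the whole chain on invented identifiers:

# sample identifiers below are invented for illustration
for raw in (' `users` ', '"mydb"."users"', '"[dbo].[Orders]"'):
    table = raw.strip().strip("`\"'").replace('"."', ".").replace("].[", ".").strip('[]')
    print(table)  # users / mydb.users / dbo.Orders

# the any() form rejects the same junk as the old chain of 'not in' tests
print(not any(_ in 'valid_table' for _ in ('>', '<', '--', ' ')))  # True: kept
print(not any(_ in '<td>users' for _ in ('>', '<', '--', ' ')))    # False: rejected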
@@ -131,19 +128,14 @@ def main():
                 pass
 
     finally:
-        f = open(TABLES_FILE, 'w+')
-
-        tables = sorted(tables.items(), key=itemgetter(1), reverse=True)
-
-        for table, count in tables:
-            f.write("%s,%d\n" % (table, count))
-
-        f.close()
-        config.set('options', 'index', str(i+1))
-
-        f = open(CONFIG_FILE, 'w+')
-        config.write(f)
-        f.close()
+        with open(TABLES_FILE, 'w+') as f:
+            tables = sorted(tables.items(), key=itemgetter(1), reverse=True)
+            for table, count in tables:
+                f.write("%s,%d\n" % (table, count))
+
+        config.set("options", "index", str(i + 1))
+        with open(CONFIG_FILE, 'w+') as f:
+            config.write(f)
 
 if __name__ == "__main__":
     main()
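The finally block now uses context managers, so tables.txt and sqlharvest.cfg are closed even if a write raises part-way through. The cfg file is what lets the harvester resume where it left off across runs; a minimal, self-contained sketch of that logic, assuming Python 2 (which the script targets via ConfigParser/urllib2/cookielib):

# a minimal sketch of the resume logic, Python 2 only
import ConfigParser

CONFIG_FILE = 'sqlharvest.cfg'

config = ConfigParser.ConfigParser()
config.read(CONFIG_FILE)  # silently ignores a missing file

if not config.has_section("options"):
    config.add_section("options")
if not config.has_option("options", "index"):
    config.set("options", "index", "0")

i = int(config.get("options", "index"))  # page offset from the previous run

# ... one batch of search results would be processed here ...

config.set("options", "index", str(i + 1))
with open(CONFIG_FILE, 'w+') as f:  # closed even if config.write() raises
    config.write(f)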