Remove unused data and download script

This commit is contained in:
Ines Montani 2016-12-08 20:39:49 +01:00
parent 0a6d529104
commit c0c5f31950
2 changed files with 0 additions and 293 deletions

View File

@ -1,47 +0,0 @@
import re
# Multi-word phrases matched as a single unit.  For every possessive the
# longer "on <x> way to" is listed before "on <x> way", so the longer phrase
# is tried first by the alternation built below.
_mw_prepositions = ['close to', 'down by', 'on the way to']
_mw_prepositions.extend(
    'on {0} way{1}'.format(possessive, suffix)
    for possessive in ('my', 'his', 'her', 'your', 'our', 'their')
    for suffix in (' to', '')
)
_mw_prepositions.append('along the route from')

# Case-insensitive alternation over every phrase above, in list order.
MW_PREPOSITIONS_RE = re.compile('|'.join(_mw_prepositions), re.IGNORECASE)
# Clock times such as "10:30", "10:30:15", "10:30 pm" or "5 a.m.".
# The optional am/pm suffix is attached directly to the colon form: written
# as a separate, later alternative it can never be reached by search/match,
# because the bare colon form succeeds first and the suffix is dropped.
# Group 1 captures the am/pm marker of the digits-only form ("5 pm").
TIME_RE = re.compile(
    r'{colon_digits}(?: ?{am_pm})?|{one_two_digits} ?({am_pm})'.format(
        colon_digits=r'[0-2]?[0-9]:[0-5][0-9](?::[0-5][0-9])?',
        one_two_digits=r'[0-2]?[0-9]',
        am_pm=r'[ap]\.?m\.?'))
# Relative date expressions: a determiner followed by "weekend", "week" or a
# capitalised weekday name ("this weekend", "next Friday").  "weekend" is
# listed before "week" because alternatives are tried left to right; with
# "week" first, a search over "this weekend" stops at "this week".
DATE_RE = re.compile(
    '(?:this|last|next|the) (?:weekend|week|{days})'.format(
        days='Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday'
    ))
# Dollar amounts written with a currency sign ("$5", "$5.50") or in words
# ("20 dollars 5 cents", "1 dollar").  The pattern is a raw string so that
# "\$" and "\d" are regex escapes rather than unrecognised string escapes,
# and the trailing "s" is optional so singular amounts are matched too.
MONEY_RE = re.compile(r'\$\d+(?:\.\d+)?|\d+ dollars?(?: \d+ cents?)?')
# Bare weekday names; matching is case-sensitive, so only the capitalised
# spelling is recognised.
DAYS_RE = re.compile('|'.join((
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
)))
# Table of (tag, label, compiled pattern) triples, one per regex defined in
# this module.  NOTE(review): the first two fields look like a POS tag and an
# entity label -- confirm against whatever consumes this table.
REGEXES = [
    ('IN', 'O', MW_PREPOSITIONS_RE),
    ('CD', 'TIME', TIME_RE),
    ('NNP', 'DATE', DATE_RE),
    ('NNP', 'DATE', DAYS_RE),
    ('CD', 'MONEY', MONEY_RE),
]

View File

@ -1,246 +0,0 @@
import os
import time
import io
import math
import re
try:
from urllib.parse import urlparse
from urllib.request import urlopen, Request
from urllib.error import HTTPError
except ImportError:
from urllib2 import urlopen, urlparse, Request, HTTPError
class UnknownContentLengthException(Exception):
    """Raised when a response has no Content-Length header."""


class InvalidChecksumException(Exception):
    """Raised when the computed checksum differs from the origin's checksum."""


class UnsupportedHTTPCodeException(Exception):
    """Raised for an HTTP status code the downloader does not handle."""


class InvalidOffsetException(Exception):
    """Raised when the server answers a ranged request with HTTP 416."""


class MissingChecksumHeader(Exception):
    """Raised when a checksum was requested but the origin supplied none."""
# Number of bytes requested per read() call, both when streaming a response
# and when re-reading a partially downloaded file (16 KiB).
CHUNK_SIZE = 1024 * 16
class RateSampler(object):
    """Context manager that samples a rate over windows of ``period`` seconds.

    Wrap each unit of work in ``with sampler:`` and report the amount handled
    inside it through :meth:`update`.  When a ``with`` block is left and at
    least ``period`` seconds have passed since the current window started,
    ``rate`` is set to ``counter / elapsed`` and the next entry starts a new
    window with a zeroed counter.
    """

    # Divisors for the units accepted by format().
    _UNIT_DIVISORS = {'MB': 1048576, 'kB': 1024}

    def __init__(self, period=1):
        self.rate = None      # last measured rate per second; None until sampled
        self.reset = True     # True when the next __enter__ must open a new window
        self.period = period  # minimum window length in seconds

    def __enter__(self):
        if self.reset:
            self.reset = False
            self.start = time.time()
            self.counter = 0
        # Return the sampler so that ``with RateSampler() as s:`` binds it
        # instead of None.
        return self

    def __exit__(self, type, value, traceback):
        elapsed = time.time() - self.start
        # ``elapsed > 0`` guards the division when ``period`` is zero or
        # negative and the clock has not advanced since the window opened.
        if elapsed >= self.period and elapsed > 0:
            self.reset = True
            self.rate = float(self.counter) / elapsed

    def update(self, value):
        """Add ``value`` to the amount counted in the current window."""
        self.counter += value

    def format(self, unit="MB"):
        """Return the rate as e.g. ``'1.50MB/s'``, or None if not yet sampled.

        ``unit`` must be ``'MB'`` or ``'kB'``; any other value raises KeyError.
        """
        if self.rate is None:
            return None
        return "%0.2f%s/s" % (self.rate / self._UNIT_DIVISORS[unit], unit)
class TimeEstimator(object):
    """Estimate the time left for a transfer from the progress made so far.

    The clock starts when the instance is created.  No estimate is produced
    until more than ``cooldown`` seconds have elapsed.
    """

    def __init__(self, cooldown=1):
        self.cooldown = cooldown
        self.start = time.time()
        self.time_left = None  # estimated seconds remaining; None until known

    def update(self, bytes_read, total_size):
        """Recompute ``time_left`` by extrapolating the average speed so far.

        ``bytes_read`` and ``total_size`` must be counted from the same
        starting point.  Nothing changes while the cooldown has not passed
        or while no bytes have been read yet.
        """
        elapsed = time.time() - self.start
        # Without any bytes there is no speed to extrapolate from, and the
        # division below would raise ZeroDivisionError.
        if elapsed > self.cooldown and bytes_read > 0:
            self.time_left = math.ceil(elapsed * total_size /
                                       bytes_read - elapsed)

    def format(self):
        """Return e.g. ``'eta 2m 5s'`` / ``'eta 59s'``, or None if unknown."""
        if self.time_left is None:
            return None
        minutes, seconds = divmod(self.time_left, 60)
        res = "eta "
        if minutes >= 1:
            res += "%dm " % minutes
        return res + "%ds" % seconds
def format_bytes_read(bytes_read, unit="MB"):
    """Render ``bytes_read`` with two decimals in ``unit`` ('MB' or 'kB').

    Any other ``unit`` raises KeyError.
    """
    amount = float(bytes_read)
    bytes_per_unit = {'MB': 1048576, 'kB': 1024}[unit]
    return "%0.2f%s" % (amount / bytes_per_unit, unit)
def format_percent(bytes_read, total_size):
    """Return ``bytes_read`` as a share of ``total_size``, e.g. ``'42.50%'``."""
    share = bytes_read * 100.0 / total_size
    return "%0.2f%%" % round(share, 2)
def get_content_range(response):
    """Parse the ``Content-Range`` header of ``response``.

    Returns ``[first, last, total]`` as ints when the header starts with
    ``bytes <first>-<last>/<total>``; returns None when the header is
    missing, blank, or does not have that form.
    """
    header = response.headers.get('Content-Range', "").strip()
    if not header:
        return None
    match = re.match(r"bytes (\d+)-(\d+)/(\d+)", header)
    if match is None:
        return None
    return list(map(int, match.groups()))
def get_content_length(response):
    """Return the ``Content-Length`` header of ``response`` as an int.

    Raises UnknownContentLengthException when the header is absent.
    """
    headers = response.headers
    if 'Content-Length' in headers:
        return int(headers.get('Content-Length').strip())
    raise UnknownContentLengthException
def get_url_meta(url, checksum_header=None):
    """Issue a HEAD request for ``url`` and return metadata about the resource.

    Args:
        url: Address of the resource.
        checksum_header: Optional name of the response header that carries
            the resource's checksum.

    Returns:
        A dict with ``'size'`` (the Content-Length as an int) and, when
        ``checksum_header`` is given and the response has a non-empty value
        for it, ``'checksum'``.

    Raises:
        UnknownContentLengthException: The response has no Content-Length.
    """
    class HeadRequest(Request):
        # Request variant that reports HEAD as its HTTP method.
        def get_method(self):
            return "HEAD"

    r = urlopen(HeadRequest(url))
    try:
        res = {'size': get_content_length(r)}
        if checksum_header:
            value = r.headers.get(checksum_header)
            if value:
                res['checksum'] = value
    finally:
        # Close the response even when the length header is missing and
        # get_content_length() raises.
        r.close()
    return res
def progress(console, bytes_read, total_size, transfer_rate, eta):
    """Write a one-line progress report to ``console`` and flush it.

    The line ends with a carriage return instead of a newline.  Fields
    whose formatter returns None or an empty string are left out; the
    trailing run of ten spaces is always included.
    """
    candidates = (
        format_bytes_read(bytes_read),
        format_percent(bytes_read, total_size),
        transfer_rate.format(),
        eta.format(),
        " " * 10,
    )
    line = " ".join(field for field in candidates if field)
    console.write("Downloaded %s\r" % line)
    console.flush()
def read_request(request, offset=0, console=None,
                 progress_func=None, write_func=None):
    """Open ``request`` and stream its body in CHUNK_SIZE pieces.

    Args:
        request: The request object to open; a ``Range`` header is added to
            it when ``offset`` is positive.
        offset: Number of bytes already present locally; the transfer
            resumes from this position.
        console: Optional object with ``write``, used for status lines.
        progress_func: Optional callable invoked after every chunk as
            ``progress_func(console, bytes_read, total_size, rate, eta)``.
            It is only called when ``console`` is also given.
        write_func: Optional callable that receives every chunk read.

    Returns:
        The response object, already closed.

    Raises:
        InvalidOffsetException: The server answered with HTTP 416.
        UnsupportedHTTPCodeException: Any other HTTP error, or a status
            other than 200 or 206.
        UnknownContentLengthException: The response has no Content-Length.
        AssertionError: A sanity check on the response failed, or fewer or
            more bytes were read than announced.
    """
    # support partial downloads
    if offset > 0:
        request.add_header('Range', "bytes=%s-" % offset)
    try:
        response = urlopen(request)
    except HTTPError as e:
        if e.code == 416:  # Requested Range Not Satisfiable
            raise InvalidOffsetException
        # TODO add http error handling here
        raise UnsupportedHTTPCodeException(e.code)

    # The response is open from here on; the ``finally`` below closes it on
    # every exit path, including failed sanity checks and write errors.
    try:
        total_size = get_content_length(response) + offset
        bytes_read = offset

        # sanity checks
        if response.code == 200:  # OK
            assert offset == 0
        elif response.code == 206:  # Partial content
            range_start, range_end, range_total = get_content_range(response)
            assert range_start == offset
            assert range_total == total_size
            assert range_end + 1 - range_start == total_size - bytes_read
        else:
            raise UnsupportedHTTPCodeException(response.code)

        eta = TimeEstimator()
        transfer_rate = RateSampler()

        if console:
            if offset > 0:
                console.write("Continue downloading...\n")
            else:
                console.write("Downloading...\n")

        while True:
            with transfer_rate:
                chunk = response.read(CHUNK_SIZE)
                if not chunk:
                    # End of body: finish the carriage-return progress line.
                    if progress_func and console:
                        console.write('\n')
                    break

                bytes_read += len(chunk)

                transfer_rate.update(len(chunk))
                # The estimator works on this session's bytes only, so the
                # resumed prefix is subtracted from both figures.
                eta.update(bytes_read - offset, total_size - offset)
                if progress_func and console:
                    progress_func(console, bytes_read, total_size,
                                  transfer_rate, eta)

            if write_func:
                write_func(chunk)
    finally:
        response.close()

    assert bytes_read == total_size
    return response
def download(url, path=".",
             checksum=None, checksum_header=None,
             headers=None, console=None):
    """Download ``url`` to ``path``, resuming a partially written file.

    Args:
        url: Address of the resource to fetch.
        path: Target file, or an existing directory; for a directory the
            text after the last ``/`` of ``url`` is used as the file name.
        checksum: Optional object with ``update(bytes)`` and ``hexdigest()``
            (presumably a hashlib object -- confirm with callers).  It is
            fed the bytes already on disk and every chunk downloaded.
        checksum_header: Name of the response header that holds the
            expected hex digest.
        headers: Optional mapping of extra request headers.
        console: Optional object with ``write``/``flush`` for progress output.

    Returns:
        The absolute path of the downloaded file.

    Raises:
        MissingChecksumHeader: ``checksum`` was given but the origin
            reported no checksum.
        InvalidChecksumException: The digest does not match the origin's.
    """
    if os.path.isdir(path):
        filename = url.rsplit('/', 1)[1]
        path = os.path.join(path, filename)
    path = os.path.abspath(path)

    with io.open(path, "a+b") as target:
        # The position reported right after opening in append mode is used
        # as the offset to resume from.
        resume_from = target.tell()

        # update checksum of partially downloaded file
        if checksum:
            target.seek(0, os.SEEK_SET)
            while True:
                existing = target.read(CHUNK_SIZE)
                if existing == b"":
                    break
                checksum.update(existing)

        def store(chunk):
            # Hash (when requested) and persist one downloaded chunk.
            if checksum:
                checksum.update(chunk)
            target.write(chunk)

        request = Request(url)

        # request headers
        for key, value in (headers or {}).items():
            request.add_header(key, value)

        try:
            response = read_request(request,
                                    offset=resume_from,
                                    console=console,
                                    progress_func=progress,
                                    write_func=store)
        except InvalidOffsetException:
            response = None

        if checksum:
            if response:
                expected = response.headers.get(checksum_header)
            else:
                # check whether file is already complete
                expected = get_url_meta(url, checksum_header).get('checksum')

            if expected is None:
                raise MissingChecksumHeader

            if checksum.hexdigest() != expected:
                raise InvalidChecksumException

            if console:
                console.write("checksum/sha256 OK\n")

    return path