mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			247 lines
		
	
	
		
			6.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			247 lines
		
	
	
		
			6.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import os
 | |
| import time
 | |
| import io
 | |
| import math
 | |
| import re
 | |
| 
 | |
| try:
 | |
|     from urllib.parse import urlparse
 | |
|     from urllib.request import urlopen, Request
 | |
|     from urllib.error import HTTPError
 | |
| except ImportError:
 | |
|     from urllib2 import urlopen, urlparse, Request, HTTPError
 | |
| 
 | |
| 
 | |
class UnknownContentLengthException(Exception):
    """The response carries no Content-Length header."""


class InvalidChecksumException(Exception):
    """The computed digest differs from the one announced by the origin."""


class UnsupportedHTTPCodeException(Exception):
    """The server answered with an HTTP status this module does not handle."""


class InvalidOffsetException(Exception):
    """The server rejected the requested byte range (HTTP 416)."""


class MissingChecksumHeader(Exception):
    """A checksum was requested but the origin did not supply one."""
 | |
| 
 | |
| 
 | |
# Number of bytes requested per read() call, both when streaming the HTTP
# response and when re-hashing an already downloaded partial file.
CHUNK_SIZE = 16384  # 16 KiB
 | |
| 
 | |
| 
 | |
class RateSampler(object):
    """Measure a throughput rate over sampling windows of ``period`` seconds.

    Intended to be used as a context manager around each unit of work:
    entering starts a new window if the previous one was completed, ``update``
    accumulates the amount processed, and leaving closes the window once at
    least ``period`` seconds have elapsed, storing the result in ``rate``
    (amount per second, or None until the first window completes).
    """

    # Divisors turning a bytes-per-second rate into the supported units.
    _DIVISORS = {'MB': 1048576, 'kB': 1024}

    def __init__(self, period=1):
        self.rate = None
        self.reset = True
        self.period = period
        # Initialised here so update() is safe even before the first
        # __enter__; __enter__ resets both when a new window starts.
        self.start = time.time()
        self.counter = 0

    def __enter__(self):
        """Start a new sampling window if the previous one was completed."""
        if self.reset:
            self.reset = False
            self.start = time.time()
            self.counter = 0
        # Return the sampler so ``with RateSampler() as s`` binds something
        # useful instead of None.
        return self

    def __exit__(self, type, value, traceback):
        """Close the window once ``period`` seconds have passed.

        Exceptions raised inside the ``with`` body are not suppressed.
        """
        elapsed = time.time() - self.start
        # ``elapsed > 0`` guards the division when period is 0 and the clock
        # did not advance between __enter__ and __exit__.
        if elapsed > 0 and elapsed >= self.period:
            self.reset = True
            self.rate = float(self.counter) / elapsed

    def update(self, value):
        """Add ``value`` to the amount counted in the current window."""
        self.counter += value

    def format(self, unit="MB"):
        """Return the last rate as e.g. ``"1.50MB/s"``.

        Returns None while no window has completed yet. ``unit`` must be
        ``'MB'`` or ``'kB'``; any other value raises KeyError.
        """
        if self.rate is None:
            return None

        return "%0.2f%s/s" % (self.rate / self._DIVISORS[unit], unit)
 | |
| 
 | |
| 
 | |
class TimeEstimator(object):
    """Estimate the remaining time of a transfer from its progress so far.

    The clock starts when the estimator is constructed. No estimate is
    produced until more than ``cooldown`` seconds have elapsed.
    """

    def __init__(self, cooldown=1):
        self.cooldown = cooldown
        self.start = time.time()
        self.time_left = None

    def update(self, bytes_read, total_size):
        """Recompute ``time_left`` (seconds, rounded up) by linear projection.

        ``bytes_read`` is the amount transferred since construction and
        ``total_size`` the amount expected overall. Calls made before any
        data has arrived leave the previous estimate untouched.
        """
        if bytes_read <= 0:
            # Nothing transferred yet: a projection would divide by zero.
            return

        elapsed = time.time() - self.start
        if elapsed > self.cooldown:
            self.time_left = math.ceil(elapsed * total_size /
                                       bytes_read - elapsed)

    def format(self):
        """Return the estimate as ``"eta 2m 5s"`` / ``"eta 59s"``.

        Returns None while no estimate is available.
        """
        if self.time_left is None:
            return None

        res = "eta "
        if self.time_left / 60 >= 1:
            res += "%dm " % (self.time_left / 60)
        return res + "%ds" % (self.time_left % 60)
 | |
| 
 | |
| 
 | |
def format_bytes_read(bytes_read, unit="MB"):
    """Render a byte count with two decimals in ``unit`` ('MB' or 'kB').

    Any other unit raises KeyError.
    """
    amount = float(bytes_read)
    bytes_per_unit = {'MB': 1048576, 'kB': 1024}[unit]
    return "%0.2f%s" % (amount / bytes_per_unit, unit)
 | |
| 
 | |
| 
 | |
def format_percent(bytes_read, total_size):
    """Return ``bytes_read`` as a percentage of ``total_size``, e.g. "25.00%"."""
    ratio = bytes_read * 100.0 / total_size
    return "%0.2f%%" % round(ratio, 2)
 | |
| 
 | |
| 
 | |
def get_content_range(response):
    """Parse the Content-Range header of ``response``.

    Returns ``[first_byte, last_byte, total_size]`` as integers, or None when
    the header is absent, empty, or does not start with the
    ``bytes <first>-<last>/<total>`` form.
    """
    header = response.headers.get('Content-Range', "").strip()
    if not header:
        return None

    match = re.match(r"bytes (\d+)-(\d+)/(\d+)", header)
    if match is None:
        return None

    return list(map(int, match.groups()))
 | |
| 
 | |
| 
 | |
def get_content_length(response):
    """Return the Content-Length of ``response`` as an integer.

    Raises UnknownContentLengthException when the header is not present.
    """
    headers = response.headers
    if 'Content-Length' in headers:
        return int(headers.get('Content-Length').strip())
    raise UnknownContentLengthException
 | |
| 
 | |
| 
 | |
def get_url_meta(url, checksum_header=None):
    """Fetch metadata about ``url`` with a HEAD request.

    Returns a dict with key ``'size'`` (the Content-Length as an int) and,
    when ``checksum_header`` is given and the response carries a non-empty
    value for it, key ``'checksum'`` holding that header value.

    Raises UnknownContentLengthException when the response has no
    Content-Length header; errors raised by ``urlopen`` propagate unchanged.
    """
    class HeadRequest(Request):
        # Request derives the HTTP verb from get_method(); override it so
        # only headers are transferred.
        def get_method(self):
            return "HEAD"

    r = urlopen(HeadRequest(url))
    try:
        res = {'size': get_content_length(r)}

        if checksum_header:
            value = r.headers.get(checksum_header)
            if value:
                res['checksum'] = value
    finally:
        # Close the connection even when the Content-Length lookup raises.
        r.close()

    return res
 | |
| 
 | |
| 
 | |
def progress(console, bytes_read, total_size, transfer_rate, eta):
    """Write a single-line progress report to ``console`` and flush it.

    The line shows the amount downloaded, the percentage completed and, when
    they are available, the transfer rate and the estimated time left. It
    ends with a carriage return so the next report overwrites it.
    """
    parts = (
        format_bytes_read(bytes_read),
        format_percent(bytes_read, total_size),
        transfer_rate.format(),
        eta.format(),
        " " * 10,  # trailing padding after the last field
    )
    # The rate and eta formatters return None until they have a value.
    shown = [part for part in parts if part]
    console.write("Downloaded %s\r" % " ".join(shown))
    console.flush()
 | |
| 
 | |
| 
 | |
def read_request(request, offset=0, console=None,
                 progress_func=None, write_func=None):
    """Open ``request`` and stream its body in CHUNK_SIZE pieces.

    request: a urllib ``Request``; a ``Range`` header is added to it when
        ``offset`` is greater than 0 so the download resumes at that byte.
    offset: number of bytes already present locally.
    console: optional object with ``write``; receives status messages.
    progress_func: optional callable invoked after each chunk as
        ``progress_func(console, bytes_read, total_size, rate, eta)``; only
        used when ``console`` is also given.
    write_func: optional callable invoked with each chunk of data.

    Returns the (already closed) response object so callers can still read
    its headers.

    Raises InvalidOffsetException when the server answers 416,
    UnsupportedHTTPCodeException for any other HTTP error or for a status
    other than 200/206, and UnknownContentLengthException when the response
    has no Content-Length. The consistency checks are ``assert`` statements
    and therefore raise AssertionError.
    """
    # support partial downloads
    if offset > 0:
        request.add_header('Range', "bytes=%s-" % offset)

    try:
        response = urlopen(request)
    except HTTPError as e:
        if e.code == 416:  # Requested Range Not Satisfiable
            raise InvalidOffsetException

        # TODO add http error handling here
        raise UnsupportedHTTPCodeException(e.code)

    # From here on the connection is open: make sure it is released even
    # when a sanity check, a read, or a callback raises.
    try:
        total_size = get_content_length(response) + offset
        bytes_read = offset

        # sanity checks
        if response.code == 200:  # OK
            assert offset == 0
        elif response.code == 206:  # Partial content
            # NOTE: get_content_range returns None when the header is
            # missing or malformed, in which case this unpacking raises
            # TypeError.
            range_start, range_end, range_total = get_content_range(response)
            assert range_start == offset
            assert range_total == total_size
            assert range_end + 1 - range_start == total_size - bytes_read
        else:
            raise UnsupportedHTTPCodeException(response.code)

        eta = TimeEstimator()
        transfer_rate = RateSampler()

        if console:
            if offset > 0:
                console.write("Continue downloading...\n")
            else:
                console.write("Downloading...\n")

        while True:
            # Each read is timed by the sampler so it can derive a rate.
            with transfer_rate:
                chunk = response.read(CHUNK_SIZE)
                if not chunk:
                    if progress_func and console:
                        # Finish the progress line, which ends in '\r'.
                        console.write('\n')
                    break

                bytes_read += len(chunk)

                transfer_rate.update(len(chunk))
                # The estimator only sees what this request transfers.
                eta.update(bytes_read - offset, total_size - offset)

            if progress_func and console:
                progress_func(console, bytes_read, total_size,
                              transfer_rate, eta)

            if write_func:
                write_func(chunk)
    finally:
        response.close()

    assert bytes_read == total_size
    return response
 | |
| 
 | |
| 
 | |
def download(url, path=".",
             checksum=None, checksum_header=None,
             headers=None, console=None):
    """Download ``url`` to ``path``, resuming a partial file if one exists.

    url: address to fetch.
    path: target file, or a directory, in which case the text after the
        last '/' of ``url`` is used as the file name.
    checksum: optional hash object exposing ``update`` and ``hexdigest``;
        it is fed the bytes already on disk plus every downloaded chunk and
        then compared with the value announced by the server.
    checksum_header: name of the response header carrying the expected hex
        digest.
    headers: optional mapping of extra request headers.
    console: optional object with ``write``/``flush`` for progress output.

    Returns the absolute path of the downloaded file.

    Raises MissingChecksumHeader when ``checksum`` is given but no expected
    digest can be obtained, and InvalidChecksumException when the digests
    differ. Exceptions from ``read_request`` other than
    InvalidOffsetException propagate unchanged.
    """
    if os.path.isdir(path):
        path = os.path.join(path, url.rsplit('/', 1)[1])
    path = os.path.abspath(path)

    # Append mode: the initial position is the size of what is already on
    # disk, and every write lands at the end of the file.
    with io.open(path, "a+b") as f:
        size = f.tell()

        # update checksum of partially downloaded file
        if checksum:
            f.seek(0, os.SEEK_SET)
            for chunk in iter(lambda: f.read(CHUNK_SIZE), b""):
                checksum.update(chunk)

        def write(chunk):
            if checksum:
                checksum.update(chunk)
            f.write(chunk)

        request = Request(url)

        # request headers
        if headers:
            for key, value in headers.items():
                request.add_header(key, value)

        try:
            response = read_request(request,
                                    offset=size,
                                    console=console,
                                    progress_func=progress,
                                    write_func=write)
        except InvalidOffsetException:
            # The server refused the range starting at ``size``; nothing
            # was transferred.
            response = None

        if checksum:
            # Test identity rather than truthiness: only None means that no
            # response was obtained.
            if response is not None:
                origin_checksum = response.headers.get(checksum_header)
            else:
                # check whether file is already complete
                meta = get_url_meta(url, checksum_header)
                origin_checksum = meta.get('checksum')

            if origin_checksum is None:
                raise MissingChecksumHeader

            if checksum.hexdigest() != origin_checksum:
                raise InvalidChecksumException

            if console:
                console.write("checksum/sha256 OK\n")

    return path
 |