mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Remove unused data and download script
This commit is contained in:
parent
0a6d529104
commit
c0c5f31950
|
@ -1,47 +0,0 @@
|
|||
import re
|
||||
|
||||
|
||||
_mw_prepositions = [
|
||||
'close to',
|
||||
'down by',
|
||||
'on the way to',
|
||||
'on my way to',
|
||||
'on my way',
|
||||
'on his way to',
|
||||
'on his way',
|
||||
'on her way to',
|
||||
'on her way',
|
||||
'on your way to',
|
||||
'on your way',
|
||||
'on our way to',
|
||||
'on our way',
|
||||
'on their way to',
|
||||
'on their way',
|
||||
'along the route from'
|
||||
]
|
||||
|
||||
|
||||
MW_PREPOSITIONS_RE = re.compile('|'.join(_mw_prepositions), flags=re.IGNORECASE)
|
||||
|
||||
|
||||
TIME_RE = re.compile(
|
||||
'{colon_digits}|{colon_digits} ?{am_pm}?|{one_two_digits} ?({am_pm})'.format(
|
||||
colon_digits=r'[0-2]?[0-9]:[0-5][0-9](?::[0-5][0-9])?',
|
||||
one_two_digits=r'[0-2]?[0-9]',
|
||||
am_pm=r'[ap]\.?m\.?'))
|
||||
|
||||
DATE_RE = re.compile(
|
||||
'(?:this|last|next|the) (?:week|weekend|{days})'.format(
|
||||
days='Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday'
|
||||
))
|
||||
|
||||
|
||||
MONEY_RE = re.compile('\$\d+(?:\.\d+)?|\d+ dollars(?: \d+ cents)?')
|
||||
|
||||
|
||||
DAYS_RE = re.compile('Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday')
|
||||
|
||||
|
||||
REGEXES = [('IN', 'O', MW_PREPOSITIONS_RE), ('CD', 'TIME', TIME_RE),
|
||||
('NNP', 'DATE', DATE_RE),
|
||||
('NNP', 'DATE', DAYS_RE), ('CD', 'MONEY', MONEY_RE)]
|
246
spacy/en/uget.py
246
spacy/en/uget.py
|
@ -1,246 +0,0 @@
|
|||
import os
|
||||
import time
|
||||
import io
|
||||
import math
|
||||
import re
|
||||
|
||||
try:
|
||||
from urllib.parse import urlparse
|
||||
from urllib.request import urlopen, Request
|
||||
from urllib.error import HTTPError
|
||||
except ImportError:
|
||||
from urllib2 import urlopen, urlparse, Request, HTTPError
|
||||
|
||||
|
||||
class UnknownContentLengthException(Exception): pass
|
||||
class InvalidChecksumException(Exception): pass
|
||||
class UnsupportedHTTPCodeException(Exception): pass
|
||||
class InvalidOffsetException(Exception): pass
|
||||
class MissingChecksumHeader(Exception): pass
|
||||
|
||||
|
||||
CHUNK_SIZE = 16 * 1024
|
||||
|
||||
|
||||
class RateSampler(object):
|
||||
def __init__(self, period=1):
|
||||
self.rate = None
|
||||
self.reset = True
|
||||
self.period = period
|
||||
|
||||
def __enter__(self):
|
||||
if self.reset:
|
||||
self.reset = False
|
||||
self.start = time.time()
|
||||
self.counter = 0
|
||||
|
||||
def __exit__(self, type, value, traceback):
|
||||
elapsed = time.time() - self.start
|
||||
if elapsed >= self.period:
|
||||
self.reset = True
|
||||
self.rate = float(self.counter) / elapsed
|
||||
|
||||
def update(self, value):
|
||||
self.counter += value
|
||||
|
||||
def format(self, unit="MB"):
|
||||
if self.rate is None:
|
||||
return None
|
||||
|
||||
divisor = {'MB': 1048576, 'kB': 1024}
|
||||
return "%0.2f%s/s" % (self.rate / divisor[unit], unit)
|
||||
|
||||
|
||||
class TimeEstimator(object):
|
||||
def __init__(self, cooldown=1):
|
||||
self.cooldown = cooldown
|
||||
self.start = time.time()
|
||||
self.time_left = None
|
||||
|
||||
def update(self, bytes_read, total_size):
|
||||
elapsed = time.time() - self.start
|
||||
if elapsed > self.cooldown:
|
||||
self.time_left = math.ceil(elapsed * total_size /
|
||||
bytes_read - elapsed)
|
||||
|
||||
def format(self):
|
||||
if self.time_left is None:
|
||||
return None
|
||||
|
||||
res = "eta "
|
||||
if self.time_left / 60 >= 1:
|
||||
res += "%dm " % (self.time_left / 60)
|
||||
return res + "%ds" % (self.time_left % 60)
|
||||
|
||||
|
||||
def format_bytes_read(bytes_read, unit="MB"):
|
||||
divisor = {'MB': 1048576, 'kB': 1024}
|
||||
return "%0.2f%s" % (float(bytes_read) / divisor[unit], unit)
|
||||
|
||||
|
||||
def format_percent(bytes_read, total_size):
|
||||
percent = round(bytes_read * 100.0 / total_size, 2)
|
||||
return "%0.2f%%" % percent
|
||||
|
||||
|
||||
def get_content_range(response):
|
||||
content_range = response.headers.get('Content-Range', "").strip()
|
||||
if content_range:
|
||||
m = re.match(r"bytes (\d+)-(\d+)/(\d+)", content_range)
|
||||
if m:
|
||||
return [int(v) for v in m.groups()]
|
||||
|
||||
|
||||
def get_content_length(response):
|
||||
if 'Content-Length' not in response.headers:
|
||||
raise UnknownContentLengthException
|
||||
return int(response.headers.get('Content-Length').strip())
|
||||
|
||||
|
||||
def get_url_meta(url, checksum_header=None):
|
||||
class HeadRequest(Request):
|
||||
def get_method(self):
|
||||
return "HEAD"
|
||||
|
||||
r = urlopen(HeadRequest(url))
|
||||
res = {'size': get_content_length(r)}
|
||||
|
||||
if checksum_header:
|
||||
value = r.headers.get(checksum_header)
|
||||
if value:
|
||||
res['checksum'] = value
|
||||
|
||||
r.close()
|
||||
return res
|
||||
|
||||
|
||||
def progress(console, bytes_read, total_size, transfer_rate, eta):
|
||||
fields = [
|
||||
format_bytes_read(bytes_read),
|
||||
format_percent(bytes_read, total_size),
|
||||
transfer_rate.format(),
|
||||
eta.format(),
|
||||
" " * 10,
|
||||
]
|
||||
console.write("Downloaded %s\r" % " ".join(filter(None, fields)))
|
||||
console.flush()
|
||||
|
||||
|
||||
def read_request(request, offset=0, console=None,
|
||||
progress_func=None, write_func=None):
|
||||
# support partial downloads
|
||||
if offset > 0:
|
||||
request.add_header('Range', "bytes=%s-" % offset)
|
||||
|
||||
try:
|
||||
response = urlopen(request)
|
||||
except HTTPError as e:
|
||||
if e.code == 416: # Requested Range Not Satisfiable
|
||||
raise InvalidOffsetException
|
||||
|
||||
# TODO add http error handling here
|
||||
raise UnsupportedHTTPCodeException(e.code)
|
||||
|
||||
total_size = get_content_length(response) + offset
|
||||
bytes_read = offset
|
||||
|
||||
# sanity checks
|
||||
if response.code == 200: # OK
|
||||
assert offset == 0
|
||||
elif response.code == 206: # Partial content
|
||||
range_start, range_end, range_total = get_content_range(response)
|
||||
assert range_start == offset
|
||||
assert range_total == total_size
|
||||
assert range_end + 1 - range_start == total_size - bytes_read
|
||||
else:
|
||||
raise UnsupportedHTTPCodeException(response.code)
|
||||
|
||||
eta = TimeEstimator()
|
||||
transfer_rate = RateSampler()
|
||||
|
||||
if console:
|
||||
if offset > 0:
|
||||
console.write("Continue downloading...\n")
|
||||
else:
|
||||
console.write("Downloading...\n")
|
||||
|
||||
while True:
|
||||
with transfer_rate:
|
||||
chunk = response.read(CHUNK_SIZE)
|
||||
if not chunk:
|
||||
if progress_func and console:
|
||||
console.write('\n')
|
||||
break
|
||||
|
||||
bytes_read += len(chunk)
|
||||
|
||||
transfer_rate.update(len(chunk))
|
||||
eta.update(bytes_read - offset, total_size - offset)
|
||||
|
||||
if progress_func and console:
|
||||
progress_func(console, bytes_read, total_size, transfer_rate, eta)
|
||||
|
||||
if write_func:
|
||||
write_func(chunk)
|
||||
|
||||
response.close()
|
||||
assert bytes_read == total_size
|
||||
return response
|
||||
|
||||
|
||||
def download(url, path=".",
|
||||
checksum=None, checksum_header=None,
|
||||
headers=None, console=None):
|
||||
|
||||
if os.path.isdir(path):
|
||||
path = os.path.join(path, url.rsplit('/', 1)[1])
|
||||
path = os.path.abspath(path)
|
||||
|
||||
with io.open(path, "a+b") as f:
|
||||
size = f.tell()
|
||||
|
||||
# update checksum of partially downloaded file
|
||||
if checksum:
|
||||
f.seek(0, os.SEEK_SET)
|
||||
for chunk in iter(lambda: f.read(CHUNK_SIZE), b""):
|
||||
checksum.update(chunk)
|
||||
|
||||
def write(chunk):
|
||||
if checksum:
|
||||
checksum.update(chunk)
|
||||
f.write(chunk)
|
||||
|
||||
request = Request(url)
|
||||
|
||||
# request headers
|
||||
if headers:
|
||||
for key, value in headers.items():
|
||||
request.add_header(key, value)
|
||||
|
||||
try:
|
||||
response = read_request(request,
|
||||
offset=size,
|
||||
console=console,
|
||||
progress_func=progress,
|
||||
write_func=write)
|
||||
except InvalidOffsetException:
|
||||
response = None
|
||||
|
||||
if checksum:
|
||||
if response:
|
||||
origin_checksum = response.headers.get(checksum_header)
|
||||
else:
|
||||
# check whether file is already complete
|
||||
meta = get_url_meta(url, checksum_header)
|
||||
origin_checksum = meta.get('checksum')
|
||||
|
||||
if origin_checksum is None:
|
||||
raise MissingChecksumHeader
|
||||
|
||||
if checksum.hexdigest() != origin_checksum:
|
||||
raise InvalidChecksumException
|
||||
|
||||
if console:
|
||||
console.write("checksum/sha256 OK\n")
|
||||
|
||||
return path
|
Loading…
Reference in New Issue
Block a user