Remove unused data and download script

2025-07-31 10:29:46 +03:00 · 2016-12-08 20:39:49 +01:00 · 2016-12-08 20:39:49 +01:00 · c0c5f31950
commit c0c5f31950
parent 0a6d529104
2 changed files with 0 additions and 293 deletions
--- a/spacy/en/regexes.py
+++ b/spacy/en/regexes.py
@ -1,47 +0,0 @@
-import re
-
-
-_mw_prepositions = [
-    'close to',
-    'down by',
-    'on the way to',
-    'on my way to',
-    'on my way',
-    'on his way to',
-    'on his way',
-    'on her way to',
-    'on her way',
-    'on your way to',
-    'on your way',
-    'on our way to',
-    'on our way',
-    'on their way to',
-    'on their way',
-    'along the route from'
-]
-
-
-MW_PREPOSITIONS_RE = re.compile('|'.join(_mw_prepositions), flags=re.IGNORECASE)
-
-
-TIME_RE = re.compile(
-    '{colon_digits}|{colon_digits} ?{am_pm}?|{one_two_digits} ?({am_pm})'.format(
-        colon_digits=r'[0-2]?[0-9]:[0-5][0-9](?::[0-5][0-9])?',
-        one_two_digits=r'[0-2]?[0-9]',
-        am_pm=r'[ap]\.?m\.?'))
-
-DATE_RE = re.compile(
-    '(?:this|last|next|the) (?:week|weekend|{days})'.format(
-        days='Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday'
-    ))
-
-
-MONEY_RE = re.compile('\$\d+(?:\.\d+)?|\d+ dollars(?: \d+ cents)?')
-
-
-DAYS_RE = re.compile('Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday')
-
-
-REGEXES = [('IN', 'O', MW_PREPOSITIONS_RE), ('CD', 'TIME', TIME_RE),
-           ('NNP', 'DATE', DATE_RE),
-           ('NNP', 'DATE', DAYS_RE), ('CD', 'MONEY', MONEY_RE)]
--- a/spacy/en/uget.py
+++ b/spacy/en/uget.py
@ -1,246 +0,0 @@
-import os
-import time
-import io
-import math
-import re
-
-try:
-    from urllib.parse import urlparse
-    from urllib.request import urlopen, Request
-    from urllib.error import HTTPError
-except ImportError:
-    from urllib2 import urlopen, urlparse, Request, HTTPError
-
-
-class UnknownContentLengthException(Exception): pass
-class InvalidChecksumException(Exception): pass
-class UnsupportedHTTPCodeException(Exception): pass
-class InvalidOffsetException(Exception): pass
-class MissingChecksumHeader(Exception): pass
-
-
-CHUNK_SIZE = 16 * 1024
-
-
-class RateSampler(object):
-    def __init__(self, period=1):
-        self.rate = None
-        self.reset = True
-        self.period = period
-
-    def __enter__(self):
-        if self.reset:
-            self.reset = False
-            self.start = time.time()
-            self.counter = 0
-
-    def __exit__(self, type, value, traceback):
-        elapsed = time.time() - self.start
-        if elapsed >= self.period:
-            self.reset = True
-            self.rate = float(self.counter) / elapsed
-
-    def update(self, value):
-        self.counter += value
-
-    def format(self, unit="MB"):
-        if self.rate is None:
-            return None
-
-        divisor = {'MB': 1048576, 'kB': 1024}
-        return "%0.2f%s/s" % (self.rate / divisor[unit], unit)
-
-
-class TimeEstimator(object):
-    def __init__(self, cooldown=1):
-        self.cooldown = cooldown
-        self.start = time.time()
-        self.time_left = None
-
-    def update(self, bytes_read, total_size):
-        elapsed = time.time() - self.start
-        if elapsed > self.cooldown:
-            self.time_left = math.ceil(elapsed * total_size /
-                                       bytes_read - elapsed)
-
-    def format(self):
-        if self.time_left is None:
-            return None
-
-        res = "eta "
-        if self.time_left / 60 >= 1:
-            res += "%dm " % (self.time_left / 60)
-        return res + "%ds" % (self.time_left % 60)
-
-
-def format_bytes_read(bytes_read, unit="MB"):
-    divisor = {'MB': 1048576, 'kB': 1024}
-    return "%0.2f%s" % (float(bytes_read) / divisor[unit], unit)
-
-
-def format_percent(bytes_read, total_size):
-    percent = round(bytes_read * 100.0 / total_size, 2)
-    return "%0.2f%%" % percent
-
-
-def get_content_range(response):
-    content_range = response.headers.get('Content-Range', "").strip()
-    if content_range:
-        m = re.match(r"bytes (\d+)-(\d+)/(\d+)", content_range)
-        if m:
-            return [int(v) for v in m.groups()]
-
-
-def get_content_length(response):
-    if 'Content-Length' not in response.headers:
-        raise UnknownContentLengthException
-    return int(response.headers.get('Content-Length').strip())
-
-
-def get_url_meta(url, checksum_header=None):
-    class HeadRequest(Request):
-        def get_method(self):
-            return "HEAD"
-
-    r = urlopen(HeadRequest(url))
-    res = {'size': get_content_length(r)}
-
-    if checksum_header:
-        value = r.headers.get(checksum_header)
-        if value:
-            res['checksum'] = value
-
-    r.close()
-    return res
-
-
-def progress(console, bytes_read, total_size, transfer_rate, eta):
-    fields = [
-        format_bytes_read(bytes_read),
-        format_percent(bytes_read, total_size),
-        transfer_rate.format(),
-        eta.format(),
-        " " * 10,
-    ]
-    console.write("Downloaded %s\r" % " ".join(filter(None, fields)))
-    console.flush()
-
-
-def read_request(request, offset=0, console=None,
-                 progress_func=None, write_func=None):
-    # support partial downloads
-    if offset > 0:
-        request.add_header('Range', "bytes=%s-" % offset)
-
-    try:
-        response = urlopen(request)
-    except HTTPError as e:
-        if e.code == 416:  # Requested Range Not Satisfiable
-            raise InvalidOffsetException
-
-        # TODO add http error handling here
-        raise UnsupportedHTTPCodeException(e.code)
-
-    total_size = get_content_length(response) + offset
-    bytes_read = offset
-
-    # sanity checks
-    if response.code == 200:  # OK
-        assert offset == 0
-    elif response.code == 206:  # Partial content
-        range_start, range_end, range_total = get_content_range(response)
-        assert range_start == offset
-        assert range_total == total_size
-        assert range_end + 1 - range_start == total_size - bytes_read
-    else:
-        raise UnsupportedHTTPCodeException(response.code)
-
-    eta = TimeEstimator()
-    transfer_rate = RateSampler()
-
-    if console:
-        if offset > 0:
-            console.write("Continue downloading...\n")
-        else:
-            console.write("Downloading...\n")
-
-    while True:
-        with transfer_rate:
-            chunk = response.read(CHUNK_SIZE)
-            if not chunk:
-                if progress_func and console:
-                    console.write('\n')
-                break
-
-            bytes_read += len(chunk)
-
-            transfer_rate.update(len(chunk))
-            eta.update(bytes_read - offset, total_size - offset)
-
-        if progress_func and console:
-            progress_func(console, bytes_read, total_size, transfer_rate, eta)
-
-        if write_func:
-            write_func(chunk)
-
-    response.close()
-    assert bytes_read == total_size
-    return response
-
-
-def download(url, path=".",
-             checksum=None, checksum_header=None,
-             headers=None, console=None):
-
-    if os.path.isdir(path):
-        path = os.path.join(path, url.rsplit('/', 1)[1])
-    path = os.path.abspath(path)
-
-    with io.open(path, "a+b") as f:
-        size = f.tell()
-
-        # update checksum of partially downloaded file
-        if checksum:
-            f.seek(0, os.SEEK_SET)
-            for chunk in iter(lambda: f.read(CHUNK_SIZE), b""):
-                checksum.update(chunk)
-
-        def write(chunk):
-            if checksum:
-                checksum.update(chunk)
-            f.write(chunk)
-
-        request = Request(url)
-
-        # request headers
-        if headers:
-            for key, value in headers.items():
-                request.add_header(key, value)
-
-        try:
-            response = read_request(request,
-                                    offset=size,
-                                    console=console,
-                                    progress_func=progress,
-                                    write_func=write)
-        except InvalidOffsetException:
-            response = None
-
-        if checksum:
-            if response:
-                origin_checksum = response.headers.get(checksum_header)
-            else:
-                # check whether file is already complete
-                meta = get_url_meta(url, checksum_header)
-                origin_checksum = meta.get('checksum')
-
-            if origin_checksum is None:
-                raise MissingChecksumHeader
-
-            if checksum.hexdigest() != origin_checksum:
-                raise InvalidChecksumException
-
-            if console:
-                console.write("checksum/sha256 OK\n")
-
-    return path