Create a method to iterate downloads

2025-12-06 09:43:53 +03:00 · 2019-05-21 16:16:16 +02:00 · 2019-05-21 16:16:16 +02:00 · 0b0f8f4285
commit 0b0f8f4285
parent a43830d403
1 changed files with 279 additions and 0 deletions
--- a/telethon/client/downloads.py
+++ b/telethon/client/downloads.py
@ -6,6 +6,7 @@ import typing

 from .users import UserMethods
 from .. import utils, helpers, errors, hints
+from ..requestiter import RequestIter
 from ..tl import TLObject, types, functions

 try:
@ -17,6 +18,140 @@ if typing.TYPE_CHECKING:
    from .telegramclient import TelegramClient


+# Chunk sizes for upload.getFile must be multiples of the smallest size
+MIN_CHUNK_SIZE = 4096
+MAX_CHUNK_SIZE = 512 * 1024
+
+
+class _DirectDownloadIter(RequestIter):
+    async def _init(
+            self, file, dc_id, offset, stride, chunk_size, request_size, file_size
+    ):
+        self.request = functions.upload.GetFileRequest(
+            file, offset=offset, limit=request_size)
+
+        self.total = file_size
+        self._stride = stride
+        self._chunk_size = chunk_size
+        self._last_part = None
+
+        self._exported = dc_id and self.client.session.dc_id != dc_id
+        if not self._exported:
+            # The used sender will also change if ``FileMigrateError`` occurs
+            self._sender = self.client._sender
+        else:
+            try:
+                self._sender = await self.client._borrow_exported_sender(dc_id)
+            except errors.DcIdInvalidError:
+                # Can't export a sender for the ID we are currently in
+                config = await self.client(functions.help.GetConfigRequest())
+                for option in config.dc_options:
+                    if option.ip_address == self.client.session.server_address:
+                        self.client.session.set_dc(
+                            option.id, option.ip_address, option.port)
+                        self.client.session.save()
+                        break
+
+                # TODO Figure out why the session may have the wrong DC ID
+                self._sender = self.client._sender
+                self._exported = False
+
+    async def _load_next_chunk(self):
+        cur = await self._request()
+        self.buffer.append(cur)
+        if len(cur) < self.request.limit:
+            self.left = len(self.buffer)
+            await self.close()
+        else:
+            self.request.offset += self._stride
+
+    async def _request(self):
+        try:
+            result = await self._sender.send(self.request)
+            if isinstance(result, types.upload.FileCdnRedirect):
+                raise NotImplementedError  # TODO Implement
+            else:
+                return result.bytes
+
+        except errors.FileMigrateError as e:
+            self.client._log[__name__].info('File lives in another DC')
+            self._sender = await self.client._borrow_exported_sender(e.new_dc)
+            self._exported = True
+            return await self._request()
+
+    async def close(self):
+        if not self._sender:
+            return
+
+        try:
+            if self._exported:
+                await self.client._return_exported_sender(self._sender)
+            elif self._sender != self.client._sender:
+                await self._sender.disconnect()
+        finally:
+            self._sender = None
+
+    async def __aenter__(self):
+        pass
+
+    async def __aexit__(self, *args):
+        await self.close()
+
+    __enter__ = helpers._sync_enter
+    __exit__ = helpers._sync_exit
+
+
+class _GenericDownloadIter(_DirectDownloadIter):
+    async def _load_next_chunk(self, mask=MIN_CHUNK_SIZE - 1):
+        # 1. Fetch enough for one chunk
+        data = b''
+
+        # 1.1. ``bad`` is how much into the data we have we need to offset
+        bad = self.request.offset & mask
+        before = self.request.offset
+
+        # 1.2. We have to fetch from a valid offset, so remove that bad part
+        self.request.offset -= bad
+
+        done = False
+        while not done and len(data) - bad < self._chunk_size:
+            cur = await self._request()
+            self.request.offset += self.request.limit
+
+            data += cur
+            done = len(cur) < self.request.limit
+
+        # 1.3 Restore our last desired offset
+        self.request.offset = before
+
+        # 2. Fill the buffer with the data we have
+        # 2.1. Slicing ``bytes`` is expensive, yield ``memoryview`` instead
+        mem = memoryview(data)
+
+        # 2.2. The current chunk starts at ``bad`` offset into the data,
+        #      and each new chunk is ``stride`` bytes apart of the other
+        for i in range(bad, len(data), self._stride):
+            self.buffer.append(mem[i:i + self._chunk_size])
+
+            # 2.3. We will yield this offset, so move to the next one
+            self.request.offset += self._stride
+
+        # 2.4. If we are in the last chunk, we will return the last partial data
+        if done:
+            self.left = len(self.buffer)
+            await self.close()
+            return
+
+        # 2.5. If we are not done, we can't return incomplete chunks.
+        if len(self.buffer[-1]) != self._chunk_size:
+            self._last_part = self.buffer.pop().tobytes()
+
+            # 3. Be careful with the offsets. Re-fetching a bit of data
+            #    is fine, since it greatly simplifies things.
+            # TODO Try to not re-fetch data
+            self.request.offset -= self._stride
+
+
 class DownloadMethods(UserMethods):

    # region Public methods
@ -366,6 +501,150 @@ class DownloadMethods(UserMethods):
            if isinstance(file, str) or in_memory:
                f.close()

+    def iter_download(
+            self: 'TelegramClient',
+            file: 'hints.FileLike',
+            *,
+            offset: int = 0,
+            stride: int = None,
+            limit: int = None,
+            chunk_size: int = None,
+            request_size: int = MAX_CHUNK_SIZE,
+            file_size: int = None,
+            dc_id: int = None
+    ):
+        """
+        Iterates over a file download, yielding chunks of the file.
+
+        This method can be used to stream files in a more convenient
+        way, since it offers more control (pausing, resuming, etc.)
+
+        .. note::
+
+            Using a value for `offset` or `stride` which is not a multiple
+            of the minimum allowed `request_size`, or if `chunk_size` is
+            different from `request_size`, the library will need to do a
+            bit more work to fetch the data in the way you intend it to.
+
+            You normally shouldn't worry about this.
+
+        Arguments
+            file (`hints.FileLike`):
+                The file of which contents you want to iterate over.
+
+            offset (`int`, optional):
+                The offset in bytes into the file from where the
+                download should start. For example, if a file is
+                1024KB long and you just want the last 512KB, you
+                would use ``offset=512 * 1024``.
+
+            stride (`int`, optional):
+                The stride of each chunk (how much the offset should
+                advance between reading each chunk). This parameter
+                should only be used for more advanced use cases.
+
+                It must be bigger than or equal to the `chunk_size`.
+
+            limit (`int`, optional):
+                The limit for how many *chunks* will be yielded at most.
+
+            chunk_size (`int`, optional):
+                The maximum size of the chunks that will be yielded.
+                Note that the last chunk may be less than this value.
+                By default, it equals to `request_size`.
+
+            request_size (`int`, optional):
+                How many bytes will be requested to Telegram when more
+                data is required. By default, as many bytes as possible
+                are requested. If you would like to request data in
+                smaller sizes, adjust this parameter.
+
+                Note that values outside the valid range will be clamped,
+                and the final value will also be a multiple of the minimum
+                allowed size.
+
+            file_size (`int`, optional):
+                If the file size is known beforehand, you should set
+                this parameter to said value. Depending on the type of
+                the input file passed, this may be set automatically.
+
+            dc_id (`int`, optional):
+                The data center the library should connect to in order
+                to download the file. You shouldn't worry about this.
+
+        Yields
+
+            ``bytes`` objects representing the chunks of the file if the
+            right conditions are met, or ``memoryview`` objects instead.
+
+        Example
+            .. code-block:: python
+
+                # Streaming `media` to an output file
+                # After the iteration ends, the sender is cleaned up
+                with open('photo.jpg', 'wb') as fd:
+                    for chunk client.iter_download(media):
+                        fd.write(chunk)
+
+                # Fetching only the header of a file (32 bytes)
+                # You should manually close the iterator in this case.
+                stream = client.iter_download(media, request_size=32)
+                header = next(stream)
+                stream.close()
+                assert len(header) == 32
+
+                # Fetching only the header, inside of an ``async def``
+                async def main():
+                    stream = client.iter_download(media, request_size=32)
+                    header = await stream.__anext__()
+                    await stream.close()
+                    assert len(header) == 32
+        """
+        if chunk_size is None:
+            chunk_size = request_size
+
+        if limit is None and file_size is not None:
+            limit = (file_size + chunk_size - 1) // chunk_size
+
+        if stride is None:
+            stride = chunk_size
+        elif stride < chunk_size:
+            raise ValueError('stride must be >= chunk_size')
+
+        request_size -= request_size % MIN_CHUNK_SIZE
+        if request_size < MIN_CHUNK_SIZE:
+            request_size = MIN_CHUNK_SIZE
+        elif request_size > MAX_CHUNK_SIZE:
+            request_size = MAX_CHUNK_SIZE
+
+        old_dc = dc_id
+        dc_id, file = utils.get_input_location(file)
+        if dc_id is None:
+            dc_id = old_dc
+
+        if chunk_size == request_size \
+                and offset % MIN_CHUNK_SIZE == 0 \
+                and stride % MIN_CHUNK_SIZE == 0:
+            cls = _DirectDownloadIter
+            self._log[__name__].info('Starting direct file download in chunks of '
+                                     '%d at %d, stride %d', request_size, offset, stride)
+        else:
+            cls = _GenericDownloadIter
+            self._log[__name__].info('Starting indirect file download in chunks of '
+                                     '%d at %d, stride %d', request_size, offset, stride)
+
+        return cls(
+            self,
+            limit,
+            file=file,
+            dc_id=dc_id,
+            offset=offset,
+            stride=stride,
+            chunk_size=chunk_size,
+            request_size=request_size,
+            file_size=file_size
+        )
+
    # endregion

    # region Private methods