def strip_text(text, entities):
    """
    Strips whitespace from both ends of the given text, modifying the
    provided entities in place so they keep pointing at the same characters.

    Every entity is clamped to the part of the text that survives:

    * entities starting after the removed leading whitespace have their
      ``offset`` shifted left by the amount of whitespace removed;
    * entities reaching into the removed leading or trailing whitespace
      are shortened;
    * entities left with no characters at all are removed from the list.

    ``entities`` may be empty or ``None``. Otherwise it must be a mutable
    list of objects with integer ``offset`` and ``length`` attributes,
    expressed in the same units used to index ``text``. The entities may
    overlap, be nested, or be in any order.

    Returns the stripped text. The ``entities`` list object itself is
    updated (not replaced), so callers holding a reference see the changes.
    """
    stripped = text.strip()
    if not entities:
        return stripped

    # Index range [lead, limit) of the original text that is kept.
    lead = len(text) - len(text.lstrip())
    limit = lead + len(stripped)

    kept = []
    for entity in entities:
        start = max(entity.offset, lead)
        stop = min(entity.offset + entity.length, limit)
        if stop <= start:
            # Entity covered only stripped whitespace (or was empty).
            continue

        # Re-express the surviving span relative to the stripped text.
        entity.offset = start - lead
        entity.length = stop - start
        kept.append(entity)

    # Slice-assign so the caller's list is mutated rather than rebound.
    entities[:] = kept
    return stripped