diff --git a/telethon/extensions/html.py b/telethon/extensions/html.py
index 73c0ba38..12b8cc3c 100644
--- a/telethon/extensions/html.py
+++ b/telethon/extensions/html.py
@@ -6,11 +6,12 @@ from collections import deque
from html import escape, unescape
from html.parser import HTMLParser
+from .. import helpers
from ..tl.types import (
MessageEntityBold, MessageEntityItalic, MessageEntityCode,
MessageEntityPre, MessageEntityEmail, MessageEntityUrl,
MessageEntityTextUrl, MessageEntityMentionName
- )
+)
# Helpers from markdown.py
@@ -126,7 +127,8 @@ def parse(html):
parser = HTMLToTelegramParser()
parser.feed(_add_surrogate(html))
- return _del_surrogate(parser.text), parser.entities
+ text = helpers.strip_text(parser.text, parser.entities)
+ return _del_surrogate(text), parser.entities
def unparse(text, entities):
diff --git a/telethon/extensions/markdown.py b/telethon/extensions/markdown.py
index be0ea507..e139c4e7 100644
--- a/telethon/extensions/markdown.py
+++ b/telethon/extensions/markdown.py
@@ -5,12 +5,12 @@ since they seem to count as two characters and it's a bit strange.
"""
import re
-from ..helpers import add_surrogate, del_surrogate
+from ..helpers import add_surrogate, del_surrogate, strip_text
from ..tl import TLObject
from ..tl.types import (
MessageEntityBold, MessageEntityItalic, MessageEntityCode,
MessageEntityPre, MessageEntityTextUrl, MessageEntityMentionName
- )
+)
DEFAULT_DELIMITERS = {
'**': MessageEntityBold,
@@ -125,6 +125,7 @@ def parse(message, delimiters=None, url_re=None):
+ message[current.offset:]
)
+ message = strip_text(message, result)
return del_surrogate(message), result
diff --git a/telethon/helpers.py b/telethon/helpers.py
index 758344ae..05c9a625 100644
--- a/telethon/helpers.py
+++ b/telethon/helpers.py
@@ -33,6 +33,41 @@ def del_surrogate(text):
return text.encode('utf-16', 'surrogatepass').decode('utf-16')
+def strip_text(text, entities):
+    """
+    Strips whitespace from the given text modifying the provided entities.
+
+    Entities are clamped in place to the stripped text: offsets are shifted
+    by the leading whitespace removed, lengths shrink where whitespace was
+    covered, and entities left without any character are removed.
+
+    This assumes that entity lengths are greater or equal to one and not
+    out of bounds. Entities need not be sorted or non-overlapping.
+    """
+    if not entities:
+        return text.strip()
+
+    # [start, end) is the range of ``text`` that survives stripping. When the
+    # text is only whitespace ``end`` is 0, so every entity is dropped below.
+    end = len(text.rstrip())
+    start = len(text) - len(text.lstrip())
+
+    kept = []
+    for e in entities:
+        new_start = max(e.offset, start)
+        new_end = min(e.offset + e.length, end)
+        if new_end > new_start:
+            # Offsets are relative to the stripped text, hence ``- start``.
+            e.offset = new_start - start
+            e.length = new_end - new_start
+            kept.append(e)
+
+    # Slice-assign so the caller's own list object reflects the removals.
+    entities[:] = kept
+
+    return text[start:end]
+
+
# endregion
# region Cryptographic related utils