Locally strip outgoing message text respecting entities

2025-07-02 03:13:10 +03:00 · 2018-11-19 10:15:56 +01:00 · 2018-11-19 10:15:56 +01:00 · aaee092a46
commit aaee092a46
parent d854babf22
3 changed files with 42 additions and 4 deletions
--- a/telethon/extensions/html.py
+++ b/telethon/extensions/html.py
@ -6,11 +6,12 @@ from collections import deque
 from html import escape, unescape
 from html.parser import HTMLParser

+from .. import helpers
 from ..tl.types import (
    MessageEntityBold, MessageEntityItalic, MessageEntityCode,
    MessageEntityPre, MessageEntityEmail, MessageEntityUrl,
    MessageEntityTextUrl, MessageEntityMentionName
-    )
+)


 # Helpers from markdown.py
@ -126,7 +127,8 @@ def parse(html):

    parser = HTMLToTelegramParser()
    parser.feed(_add_surrogate(html))
-    return _del_surrogate(parser.text), parser.entities
+    text = helpers.strip_text(parser.text, parser.entities)
+    return _del_surrogate(text), parser.entities


 def unparse(text, entities):
--- a/telethon/extensions/markdown.py
+++ b/telethon/extensions/markdown.py
@ -5,12 +5,12 @@ since they seem to count as two characters and it's a bit strange.
 """
 import re

-from ..helpers import add_surrogate, del_surrogate
+from ..helpers import add_surrogate, del_surrogate, strip_text
 from ..tl import TLObject
 from ..tl.types import (
    MessageEntityBold, MessageEntityItalic, MessageEntityCode,
    MessageEntityPre, MessageEntityTextUrl, MessageEntityMentionName
-    )
+)

 DEFAULT_DELIMITERS = {
    '**': MessageEntityBold,
@ -125,6 +125,7 @@ def parse(message, delimiters=None, url_re=None):
            + message[current.offset:]
        )

+    message = strip_text(message, result)
    return del_surrogate(message), result


--- a/telethon/helpers.py
+++ b/telethon/helpers.py
@ -33,6 +33,41 @@ def del_surrogate(text):
    return text.encode('utf-16', 'surrogatepass').decode('utf-16')


+def strip_text(text, entities):
+    """
+    Strips whitespace from both ends of the given text, modifying the
+    provided entities in place so they keep covering the same characters.
+
+    Every entity is clamped to the part of the text that survives the
+    strip: offsets are shifted left by the number of leading whitespace
+    characters removed, lengths are shortened where an entity reached
+    into stripped whitespace, and entities that end up covering no text
+    at all are removed from the list.
+
+    All entities are examined (not just the first and the last one), so
+    they are not required to be sorted or non-overlapping.
+
+    ``text`` is the string that the entity offsets and lengths index
+    into. ``entities`` is a mutable list of objects with ``offset`` and
+    ``length`` attributes; both the list and its items are updated in
+    place. The stripped text is returned.
+    """
+    if not entities:
+        return text.strip()
+
+    stripped = text.strip()
+    if len(stripped) == len(text):
+        # Nothing was removed, so no entity needs to be touched.
+        return text
+
+    # Bounds of the surviving slice, expressed as indices into ``text``.
+    start = len(text) - len(text.lstrip())
+    end = start + len(stripped)
+
+    kept = []
+    for e in entities:
+        lo = max(e.offset, start)
+        hi = min(e.offset + e.length, end)
+        if lo < hi:
+            # Re-express the clamped range relative to the stripped text;
+            # without the ``- start`` shift the entity would keep pointing
+            # at where its text used to be before the leading strip.
+            e.offset = lo - start
+            e.length = hi - lo
+            kept.append(e)
+
+    # Slice-assign so callers holding this same list see the removals.
+    entities[:] = kept
+    return stripped
+
+
 # endregion

 # region Cryptographic related utils