Fix unparsing text with malformed message entities

2025-07-22 05:49:45 +03:00 · 2019-12-19 15:48:27 +01:00 · 2019-12-19 15:48:27 +01:00 · f3111f93b2
commit f3111f93b2
parent ccbc1c669c
2 changed files with 27 additions and 3 deletions
--- a/telethon/extensions/html.py
+++ b/telethon/extensions/html.py
@ -168,9 +168,21 @@ def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0,
            continue

        skip_entity = False
-        entity_text = unparse(text=text[relative_offset:relative_offset + entity.length],
+        length = entity.length
+
+        # If we are in the middle of a surrogate nudge the position by +1.
+        # Otherwise we would end up with malformed text and fail to encode.
+        # For example of bad input: "Hi \ud83d\ude1c"
+        # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
+        if '\ud800' <= text[relative_offset] <= '\udfff':
+            relative_offset += 1
+
+        if '\ud800' <= text[relative_offset + length] <= '\udfff':
+            length += 1
+
+        entity_text = unparse(text=text[relative_offset:relative_offset + length],
                              entities=entities[i + 1:],
-                              _offset=entity.offset, _length=entity.length)
+                              _offset=entity.offset, _length=length)
        entity_type = type(entity)

        if entity_type == MessageEntityBold:
@ -208,6 +220,10 @@ def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0,
                        .format(entity.user_id, entity_text))
        else:
            skip_entity = True
-        last_offset = relative_offset + (0 if skip_entity else entity.length)
+        last_offset = relative_offset + (0 if skip_entity else length)
+
+    if last_offset < len(text) and '\ud800' <= text[last_offset] <= '\udfff':
+        last_offset += 1
+
    html.append(escape(text[last_offset:]))
    return _del_surrogate(''.join(html))
--- a/telethon/extensions/markdown.py
+++ b/telethon/extensions/markdown.py
@ -184,6 +184,14 @@ def unparse(text, entities, delimiters=None, url_fmt=None):
    insert_at.sort(key=lambda t: t[0])
    while insert_at:
        at, what = insert_at.pop()
+
+        # If we are in the middle of a surrogate nudge the position by +1.
+        # Otherwise we would end up with malformed text and fail to encode.
+        # For example of bad input: "Hi \ud83d\ude1c"
+        # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
+        if '\ud800' <= text[at] <= '\udfff':
+            at += 1
+
        text = text[:at] + what + text[at:]

    return del_surrogate(text)