diff --git a/telethon/extensions/html.py b/telethon/extensions/html.py index ef5e8a34..dc634318 100644 --- a/telethon/extensions/html.py +++ b/telethon/extensions/html.py @@ -168,9 +168,21 @@ def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0, continue skip_entity = False - entity_text = unparse(text=text[relative_offset:relative_offset + entity.length], + length = entity.length + + # If we are in the middle of a surrogate nudge the position by +1. + # Otherwise we would end up with malformed text and fail to encode. + # For example of bad input: "Hi \ud83d\ude1c" + # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF + if '\ud800' <= text[relative_offset] <= '\udfff': + relative_offset += 1 + + if '\ud800' <= text[relative_offset + length] <= '\udfff': + length += 1 + + entity_text = unparse(text=text[relative_offset:relative_offset + length], entities=entities[i + 1:], - _offset=entity.offset, _length=entity.length) + _offset=entity.offset, _length=length) entity_type = type(entity) if entity_type == MessageEntityBold: @@ -208,6 +220,10 @@ def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0, .format(entity.user_id, entity_text)) else: skip_entity = True - last_offset = relative_offset + (0 if skip_entity else entity.length) + last_offset = relative_offset + (0 if skip_entity else length) + + if last_offset < len(text) and '\ud800' <= text[last_offset] <= '\udfff': + last_offset += 1 + html.append(escape(text[last_offset:])) return _del_surrogate(''.join(html)) diff --git a/telethon/extensions/markdown.py b/telethon/extensions/markdown.py index 2beb3d91..d127ca18 100644 --- a/telethon/extensions/markdown.py +++ b/telethon/extensions/markdown.py @@ -184,6 +184,14 @@ def unparse(text, entities, delimiters=None, url_fmt=None): insert_at.sort(key=lambda t: t[0]) while insert_at: at, what = insert_at.pop() + + # If we are in the middle of a surrogate nudge the position by +1. + # Otherwise we would end up with malformed text and fail to encode. + # For example of bad input: "Hi \ud83d\ude1c" + # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF + if '\ud800' <= text[at] <= '\udfff': + at += 1 + text = text[:at] + what + text[at:] return del_surrogate(text)