Fix within surrogate detection

2025-08-02 03:00:15 +03:00 · 2020-02-20 10:53:28 +01:00 · 2020-02-20 10:53:28 +01:00 · 3d32e16235
commit 3d32e16235
parent 3a6c955c90
5 changed files with 48 additions and 12 deletions
--- a/telethon/extensions/html.py
+++ b/telethon/extensions/html.py
@ -174,12 +174,10 @@ def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0,
        # Otherwise we would end up with malformed text and fail to encode.
        # For example of bad input: "Hi \ud83d\ude1c"
        # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
-        while (relative_offset < _length
-                and '\ud800' <= text[relative_offset] <= '\udfff'):
+        while helpers.within_surrogate(text, relative_offset, length=_length):
            relative_offset += 1

-        while (relative_offset + length < _length
-                and '\ud800' <= text[relative_offset + length] <= '\udfff'):
+        while helpers.within_surrogate(text, relative_offset + length, length=_length):
            length += 1

        entity_text = unparse(text=text[relative_offset:relative_offset + length],
@ -224,7 +222,7 @@ def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0,
            skip_entity = True
        last_offset = relative_offset + (0 if skip_entity else length)

-    while last_offset < _length and '\ud800' <= text[last_offset] <= '\udfff':
+    while helpers.within_surrogate(text, last_offset, length=_length):
        last_offset += 1

    html.append(escape(text[last_offset:]))
--- a/telethon/extensions/markdown.py
+++ b/telethon/extensions/markdown.py
@ -6,7 +6,7 @@ since they seem to count as two characters and it's a bit strange.
 import re
 import warnings

-from ..helpers import add_surrogate, del_surrogate, strip_text
+from ..helpers import add_surrogate, del_surrogate, within_surrogate, strip_text
 from ..tl import TLObject
 from ..tl.types import (
    MessageEntityBold, MessageEntityItalic, MessageEntityCode,
@ -185,11 +185,11 @@ def unparse(text, entities, delimiters=None, url_fmt=None):
    while insert_at:
        at, what = insert_at.pop()

-        # If we are in the middle of a surrogate nudge the position by +1.
+        # If we are in the middle of a surrogate nudge the position forward.
        # Otherwise we would end up with malformed text and fail to encode.
        # For example of bad input: "Hi \ud83d\ude1c"
        # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
-        while at < len(text) and '\ud800' <= text[at] <= '\udfff':
+        while within_surrogate(text, at):
            at += 1

        text = text[:at] + what + text[at:]
--- a/telethon/helpers.py
+++ b/telethon/helpers.py
@ -40,6 +40,20 @@ def del_surrogate(text):
    return text.encode('utf-16', 'surrogatepass').decode('utf-16')


+def within_surrogate(text, index, *, length=None):
+    """
+    `True` if ``index`` splits a UTF-16 surrogate pair in ``text``, that is,
+    a high surrogate sits at ``index - 1`` and its low surrogate at ``index``.
+    ``length`` (default ``len(text)``) caps how much of ``text`` is in bounds.
+    """
+    limit = len(text) if length is None else min(length, len(text))
+    return (
+            0 < index < limit and  # in bounds; index 1 can split a leading pair
+            '\ud800' <= text[index - 1] <= '\udbff' and  # previous is high
+            '\udc00' <= text[index] <= '\udfff'  # current is low
+    )
+
+
 def strip_text(text, entities):
    """
    Strips whitespace from the given text modifying the provided entities.
--- a/tests/telethon/extensions/test_html.py
+++ b/tests/telethon/extensions/test_html.py
@ -23,7 +23,7 @@ def test_malformed_entities():
    text = '🏆Telegram Official Android Challenge is over🏆.'
    entities = [MessageEntityTextUrl(offset=2, length=43, url='https://example.com')]
    result = html.unparse(text, entities)
-    assert result == '🏆<a href="https://example.com">Telegram Official Android Challenge is over🏆</a>.'
+    assert result == '🏆<a href="https://example.com">Telegram Official Android Challenge is over</a>🏆.'


 def test_trailing_malformed_entities():
@ -35,7 +35,7 @@ def test_trailing_malformed_entities():
    text = '🏆Telegram Official Android Challenge is over🏆'
    entities = [MessageEntityTextUrl(offset=2, length=43, url='https://example.com')]
    result = html.unparse(text, entities)
-    assert result == '🏆<a href="https://example.com">Telegram Official Android Challenge is over🏆</a>'
+    assert result == '🏆<a href="https://example.com">Telegram Official Android Challenge is over</a>🏆'


 def test_entities_together():
@ -51,3 +51,15 @@ def test_entities_together():

    text = html.unparse(text, entities)
    assert text == original
+
+
+def test_offset_at_emoji():
+    """
+    Checks that an entity whose offset is at an emoji keeps the emoji intact.
+    """
+    markup = '<strong>Hi</strong>\n<em>👉</em> See <strong>example</strong>'
+    plain = 'Hi\n👉 See example'
+    spans = [MessageEntityBold(0, 2), MessageEntityItalic(3, 2), MessageEntityBold(10, 7)]
+
+    assert html.parse(markup) == (plain, spans)
+    assert html.unparse(plain, spans) == markup
--- a/tests/telethon/extensions/test_markdown.py
+++ b/tests/telethon/extensions/test_markdown.py
@ -23,7 +23,7 @@ def test_malformed_entities():
    text = '🏆Telegram Official Android Challenge is over🏆.'
    entities = [MessageEntityTextUrl(offset=2, length=43, url='https://example.com')]
    result = markdown.unparse(text, entities)
-    assert result == "🏆[Telegram Official Android Challenge is over🏆](https://example.com)."
+    assert result == "🏆[Telegram Official Android Challenge is over](https://example.com)🏆."


 def test_trailing_malformed_entities():
@ -35,7 +35,7 @@ def test_trailing_malformed_entities():
    text = '🏆Telegram Official Android Challenge is over🏆'
    entities = [MessageEntityTextUrl(offset=2, length=43, url='https://example.com')]
    result = markdown.unparse(text, entities)
-    assert result == "🏆[Telegram Official Android Challenge is over🏆](https://example.com)"
+    assert result == "🏆[Telegram Official Android Challenge is over](https://example.com)🏆"


 def test_entities_together():
@ -51,3 +51,15 @@ def test_entities_together():

    text = markdown.unparse(text, entities)
    assert text == original
+
+
+def test_offset_at_emoji():
+    """
+    Checks that an entity whose offset is at an emoji keeps the emoji intact.
+    """
+    markup = '**Hi**\n__👉__ See **example**'
+    plain = 'Hi\n👉 See example'
+    spans = [MessageEntityBold(0, 2), MessageEntityItalic(3, 2), MessageEntityBold(10, 7)]
+
+    assert markdown.parse(markup) == (plain, spans)
+    assert markdown.unparse(plain, spans) == markup