From 3d32e16235b0003c01bf73c75530a09d51926ec9 Mon Sep 17 00:00:00 2001 From: Lonami Exo Date: Thu, 20 Feb 2020 10:53:28 +0100 Subject: [PATCH] Fix within surrogate detection --- telethon/extensions/html.py | 8 +++----- telethon/extensions/markdown.py | 6 +++--- telethon/helpers.py | 14 ++++++++++++++ tests/telethon/extensions/test_html.py | 16 ++++++++++++++-- tests/telethon/extensions/test_markdown.py | 16 ++++++++++++++-- 5 files changed, 48 insertions(+), 12 deletions(-) diff --git a/telethon/extensions/html.py b/telethon/extensions/html.py index 9732b615..62e622ba 100644 --- a/telethon/extensions/html.py +++ b/telethon/extensions/html.py @@ -174,12 +174,10 @@ def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0, # Otherwise we would end up with malformed text and fail to encode. # For example of bad input: "Hi \ud83d\ude1c" # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF - while (relative_offset < _length - and '\ud800' <= text[relative_offset] <= '\udfff'): + while helpers.within_surrogate(text, relative_offset, length=_length): relative_offset += 1 - while (relative_offset + length < _length - and '\ud800' <= text[relative_offset + length] <= '\udfff'): + while helpers.within_surrogate(text, relative_offset + length, length=_length): length += 1 entity_text = unparse(text=text[relative_offset:relative_offset + length], @@ -224,7 +222,7 @@ def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0, skip_entity = True last_offset = relative_offset + (0 if skip_entity else length) - while last_offset < _length and '\ud800' <= text[last_offset] <= '\udfff': + while helpers.within_surrogate(text, last_offset, length=_length): last_offset += 1 html.append(escape(text[last_offset:])) diff --git a/telethon/extensions/markdown.py b/telethon/extensions/markdown.py index 480d633d..f6d59106 100644 --- a/telethon/extensions/markdown.py +++ b/telethon/extensions/markdown.py @@ -6,7 +6,7 @@ since they seem to count 
as two characters and it's a bit strange. import re import warnings -from ..helpers import add_surrogate, del_surrogate, strip_text +from ..helpers import add_surrogate, del_surrogate, within_surrogate, strip_text from ..tl import TLObject from ..tl.types import ( MessageEntityBold, MessageEntityItalic, MessageEntityCode, @@ -185,11 +185,11 @@ def unparse(text, entities, delimiters=None, url_fmt=None): while insert_at: at, what = insert_at.pop() - # If we are in the middle of a surrogate nudge the position by +1. + # If we are in the middle of a surrogate pair nudge the position by +1. # Otherwise we would end up with malformed text and fail to encode. # For example of bad input: "Hi \ud83d\ude1c" # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF - while at < len(text) and '\ud800' <= text[at] <= '\udfff': + while within_surrogate(text, at): at += 1 text = text[:at] + what + text[at:] diff --git a/telethon/helpers.py b/telethon/helpers.py index 4c8d4799..1b5f8843 100644 --- a/telethon/helpers.py +++ b/telethon/helpers.py @@ -40,6 +40,20 @@ def del_surrogate(text): return text.encode('utf-16', 'surrogatepass').decode('utf-16') +def within_surrogate(text, index, *, length=None): + """ + `True` if ``index`` splits a surrogate pair (high half before it, low half at it). + ``length`` bounds the valid indices and defaults to ``len(text)``. + """ + if length is None: + length = len(text) + return ( + 0 < index < min(length, len(text)) and # in bounds; index 1 can split a leading pair + '\ud800' <= text[index - 1] <= '\udbff' and # previous is a high surrogate + '\udc00' <= text[index] <= '\udfff' # current is a low surrogate + ) + + def strip_text(text, entities): """ Strips whitespace from the given text modifying the provided entities. diff --git a/tests/telethon/extensions/test_html.py b/tests/telethon/extensions/test_html.py index ee497321..59d96e0d 100644 --- a/tests/telethon/extensions/test_html.py +++ b/tests/telethon/extensions/test_html.py @@ -23,7 +23,7 @@ def test_malformed_entities(): text = '🏆Telegram Official Android Challenge is over🏆.'
entities = [MessageEntityTextUrl(offset=2, length=43, url='https://example.com')] result = html.unparse(text, entities) - assert result == '🏆<a href="https://example.com">Telegram Official Android Challenge is over🏆</a>.' + assert result == '🏆<a href="https://example.com">Telegram Official Android Challenge is over</a>🏆.' def test_trailing_malformed_entities(): @@ -35,7 +35,7 @@ def test_trailing_malformed_entities(): text = '🏆Telegram Official Android Challenge is over🏆' entities = [MessageEntityTextUrl(offset=2, length=43, url='https://example.com')] result = html.unparse(text, entities) - assert result == '🏆<a href="https://example.com">Telegram Official Android Challenge is over🏆</a>' + assert result == '🏆<a href="https://example.com">Telegram Official Android Challenge is over</a>🏆' def test_entities_together(): @@ -51,3 +51,15 @@ def test_entities_together(): text = html.unparse(text, entities) assert text == original + + +def test_offset_at_emoji(): + """ + Tests that an entity starting at an emoji preserves the emoji. + """ + text = 'Hi\n👉 See example' + entities = [MessageEntityBold(0, 2), MessageEntityItalic(3, 2), MessageEntityBold(10, 7)] + parsed = '<strong>Hi</strong>\n<em>👉</em> See <strong>example</strong>' + + assert html.parse(parsed) == (text, entities) + assert html.unparse(text, entities) == parsed diff --git a/tests/telethon/extensions/test_markdown.py b/tests/telethon/extensions/test_markdown.py index 2f263644..bd78e4d8 100644 --- a/tests/telethon/extensions/test_markdown.py +++ b/tests/telethon/extensions/test_markdown.py @@ -23,7 +23,7 @@ def test_malformed_entities(): text = '🏆Telegram Official Android Challenge is over🏆.' entities = [MessageEntityTextUrl(offset=2, length=43, url='https://example.com')] result = markdown.unparse(text, entities) - assert result == "🏆[Telegram Official Android Challenge is over🏆](https://example.com)." + assert result == "🏆[Telegram Official Android Challenge is over](https://example.com)🏆."
def test_trailing_malformed_entities(): @@ -35,7 +35,7 @@ def test_trailing_malformed_entities(): text = '🏆Telegram Official Android Challenge is over🏆' entities = [MessageEntityTextUrl(offset=2, length=43, url='https://example.com')] result = markdown.unparse(text, entities) - assert result == "🏆[Telegram Official Android Challenge is over🏆](https://example.com)" + assert result == "🏆[Telegram Official Android Challenge is over](https://example.com)🏆" def test_entities_together(): @@ -51,3 +51,15 @@ def test_entities_together(): text = markdown.unparse(text, entities) assert text == original + + +def test_offset_at_emoji(): + """ + Tests that an entity starting at an emoji preserves the emoji. + """ + text = 'Hi\n👉 See example' + entities = [MessageEntityBold(0, 2), MessageEntityItalic(3, 2), MessageEntityBold(10, 7)] + parsed = '**Hi**\n__👉__ See **example**' + + assert markdown.parse(parsed) == (text, entities) + assert markdown.unparse(text, entities) == parsed