# telethon/helpers.py -- state of ``strip_text`` after the change
# "Fix HTML/MD parser producing empty MessageEntity (#3885)", closes #3884.
def strip_text(text, entities):
    """
    Strips whitespace from the given surrogated text modifying the provided
    entities, also removing any empty (0-length) entities.

    Every entity is treated as the half-open interval
    ``[offset, offset + length)`` over ``text``. That interval is clamped to
    the part of ``text`` that survives ``str.strip()`` and then shifted left
    by the number of leading whitespace characters that were removed.

    :param text: the string to strip.
    :param entities: a mutable list of objects exposing integer ``offset``
        and ``length`` attributes, or a falsy value (``None`` / empty list)
        when there is nothing to adjust. The list and the entities kept in
        it are modified in place; relative order is preserved.
    :return: ``text`` with leading and trailing whitespace removed.

    Entities that end up empty (they were 0-length to begin with, or covered
    nothing but stripped whitespace) are removed from ``entities``. An entity
    with a negative ``length`` is treated as empty and removed as well.
    """
    if not entities:
        return text.strip()

    original_len = len(text)
    text = text.lstrip()
    # Number of characters removed from the left; also the index (in the
    # original text) of the first character that is kept.
    left_offset = original_len - len(text)
    text = text.rstrip()
    # Index (in the original text) one past the last character that is kept.
    right_bound = left_offset + len(text)

    kept = []
    for entity in entities:
        # Intersect the entity with the surviving window of the text.
        start = max(entity.offset, left_offset)
        end = min(entity.offset + entity.length, right_bound)
        if end <= start:
            # Nothing of this entity is left after stripping.
            continue

        entity.offset = start - left_offset
        entity.length = end - start
        kept.append(entity)

    # Slice assignment keeps the caller's list object and avoids the
    # quadratic cost of deleting items one at a time.
    entities[:] = kept
    return text
# tests/telethon/test_helpers.py -- imports shown in the patch hunk, followed
# by the post-patch ``test_strip_text``.
from base64 import b64decode

import pytest

from telethon import helpers
from telethon.utils import get_inner_text
from telethon.tl.types import MessageEntityUnknown as Meu


def test_strip_text():
    """
    Check ``helpers.strip_text`` against a table of entity layouts.

    For each case the stripped text must equal ``'text'``, the surviving
    entities must match the expected (offset, length) pairs regardless of
    order, and every inner text obtained after stripping must be one of the
    stripped inner texts obtained before stripping.
    """
    raw = ' text '
    stripped = 'text'

    def spans(entities):
        # Order-insensitive view of a list of entities.
        return sorted((ent.offset, ent.length) for ent in entities)

    cases = (
        ([], []),
        ([Meu(i, 0) for i in range(10)], []),  # del ''
        ([Meu(0, 0), Meu(0, 1), Meu(5, 1)], []),  # del '', ' ', ' '
        ([Meu(0, 3)], [Meu(0, 2)]),  # ' te' -> 'te'
        ([Meu(3, 1)], [Meu(2, 1)]),  # 'x'
        ([Meu(3, 2)], [Meu(2, 2)]),  # 'xt'
        ([Meu(3, 3)], [Meu(2, 2)]),  # 'xt ' -> 'xt'
        ([Meu(0, 6)], [Meu(0, 4)]),  # ' text ' -> 'text'
    )

    for given, wanted in cases:
        # strip_text mutates what it receives, so work on fresh copies and
        # keep ``given`` untouched for the "before" comparison below.
        working = [Meu(ent.offset, ent.length) for ent in given]

        result = helpers.strip_text(raw, working)
        assert result == stripped
        assert spans(working) == spans(wanted)

        allowed = [piece.strip() for piece in get_inner_text(raw, given)]
        for piece in get_inner_text(result, working):
            assert piece in allowed


# NOTE: in the original test module ``class TestSyncifyAsyncContext`` follows
# here; its body is outside the visible part of the patch.