Fix HTML/MD parser producing empty MessageEntity (#3885)

Closes #3884. The implementation is also simplified.
2025-12-04 16:53:55 +03:00 · 2022-07-25 17:11:26 +08:00 · 2022-07-25 17:11:26 +08:00 · 046e2cb605
commit 046e2cb605
parent 066820900d
2 changed files with 86 additions and 29 deletions
--- a/telethon/helpers.py
+++ b/telethon/helpers.py
@@ -64,40 +64,74 @@ def within_surrogate(text, index, *, length=None):
 def strip_text(text, entities):
    """
-    Strips whitespace from the given text modifying the provided entities.
+    Strips whitespace from the given surrogated text modifying the provided
    entities, also removing any empty (0-length) entities.
-    This assumes that there are no overlapping entities, that their length
+    This assumes that the length of entities is greater than or equal to 0, and
-    is greater or equal to one, and that their length is not out of bounds.
+    that no entity is out of bounds.
    """
    if not entities:
        return text.strip()
-    while text and text[-1].isspace():
+    len_ori = len(text)
-        e = entities[-1]
+    text = text.lstrip()
-        if e.offset + e.length == len(text):
+    left_offset = len_ori - len(text)
-            if e.length == 1:
+    text = text.rstrip()
-                del entities[-1]
+    len_final = len(text)
-                if not entities:
+
-                    return text.strip()
+    for i in reversed(range(len(entities))):
        e = entities[i]
        if e.length == 0:
            del entities[i]
            continue
        if e.offset + e.length > left_offset:
            if e.offset >= left_offset:
                #  0 1|2 3 4 5       |       0 1|2 3 4 5
                #     ^     ^        |          ^
                #   lo(2)  o(5)      |      o(2)/lo(2)
                e.offset -= left_offset
                #     |0 1 2 3       |          |0 1 2 3
                #           ^        |          ^
                #     o=o-lo(3=5-2)  |    o=o-lo(0=2-2)
            else:
-                e.length -= 1
+                # e.offset < left_offset and e.offset + e.length > left_offset
-        text = text[:-1]
+                #  0 1 2 3|4 5 6 7 8 9 10
                #   ^     ^           ^
                #  o(1) lo(4)      o+l(1+9)
                e.length = e.offset + e.length - left_offset
                e.offset = 0
                #         |0 1 2 3 4 5 6
                #         ^           ^
                #        o(0)  o+l=0+o+l-lo(6=0+6=0+1+9-4)
        else:
            # e.offset + e.length <= left_offset
            #   0 1 2 3|4 5
            #  ^       ^
            # o(0)   o+l(4)
            #        lo(4)
            del entities[i]
            continue
-    while text and text[0].isspace():
+        if e.offset + e.length <= len_final:
-        for i in reversed(range(len(entities))):
+            # |0 1 2 3 4 5 6 7 8 9
-            e = entities[i]
+            #   ^                 ^
-            if e.offset != 0:
+            #  o(1)       o+l(1+9)/lf(10)
-                e.offset -= 1
+            continue
-                continue
+        if e.offset >= len_final:
-
+            # |0 1 2 3 4
-            if e.length == 1:
+            #           ^
-                del entities[0]
+            #       o(5)/lf(5)
-                if not entities:
+            del entities[i]
-                    return text.lstrip()
+        else:
-            else:
+            # e.offset < len_final and e.offset + e.length > len_final
-                e.length -= 1
+            # |0 1 2 3 4 5 (6) (7) (8) (9)
-
+            #   ^         ^           ^
-        text = text[1:]
+            #  o(1)     lf(6)      o+l(1+8)
            e.length = len_final - e.offset
            # |0 1 2 3 4 5
            #   ^         ^
            #  o(1) o+l=o+lf-o=lf(6=1+5=1+6-1)
    return text
--- a/tests/telethon/test_helpers.py
+++ b/tests/telethon/test_helpers.py
@@ -7,11 +7,34 @@ from base64 import b64decode
 import pytest
 from telethon import helpers
 from telethon.utils import get_inner_text
 from telethon.tl.types import MessageEntityUnknown as Meu
 def test_strip_text():
-    assert helpers.strip_text(" text ", []) == "text"
+    text = ' text '
-    # I can't interpret the rest of the code well enough yet
+    text_stripped = 'text'
    entities_before_and_after = (
        ([], []),
        ([Meu(i, 0) for i in range(10)], []),  # del ''
        ([Meu(0, 0), Meu(0, 1), Meu(5, 1)], []),  # del '', ' ', ' '
        ([Meu(0, 3)], [Meu(0, 2)]),  # ' te' -> 'te'
        ([Meu(3, 1)], [Meu(2, 1)]),  # 'x'
        ([Meu(3, 2)], [Meu(2, 2)]),  # 'xt'
        ([Meu(3, 3)], [Meu(2, 2)]),  # 'xt ' -> 'xt'
        ([Meu(0, 6)], [Meu(0, 4)]),  # ' text ' -> 'text'
    )
    for entities_before, entities_expected in entities_before_and_after:
        entities_for_test = [Meu(meu.offset, meu.length) for meu in entities_before]  # deep copy
        text_after = helpers.strip_text(text, entities_for_test)
        assert text_after == text_stripped
        assert sorted((e.offset, e.length) for e in entities_for_test) \
               == sorted((e.offset, e.length) for e in entities_expected)
        inner_text_before = get_inner_text(text, entities_before)
        inner_text_before_stripped = [t.strip() for t in inner_text_before]
        inner_text_after = get_inner_text(text_after, entities_for_test)
        for t in inner_text_after:
            assert t in inner_text_before_stripped
 class TestSyncifyAsyncContext: