Fix HTML/MD parser producing empty MessageEntity (#3885)

Closes #3884. The implementation is also simplified.
2025-02-23 15:02:56 +03:00 · 2022-07-25 17:11:26 +08:00 · 2022-07-25 17:11:26 +08:00 · 046e2cb605
commit 046e2cb605
parent 066820900d
2 changed files with 86 additions and 29 deletions
--- a/telethon/helpers.py
+++ b/telethon/helpers.py
@@ -64,40 +64,74 @@ def within_surrogate(text, index, *, length=None):

 def strip_text(text, entities):
    """
-    Strips whitespace from the given text modifying the provided entities.
+    Strips whitespace from the given surrogated text modifying the provided
+    entities, also removing any empty (0-length) entities.

-    This assumes that there are no overlapping entities, that their length
-    is greater or equal to one, and that their length is not out of bounds.
+    This assumes that the length of entities is greater or equal to 0, and
+    that no entity is out of bounds.
    """
    if not entities:
        return text.strip()

-    while text and text[-1].isspace():
-        e = entities[-1]
-        if e.offset + e.length == len(text):
-            if e.length == 1:
-                del entities[-1]
-                if not entities:
-                    return text.strip()
+    len_ori = len(text)
+    text = text.lstrip()
+    left_offset = len_ori - len(text)
+    text = text.rstrip()
+    len_final = len(text)
+
+    for i in reversed(range(len(entities))):
+        e = entities[i]
+        if e.length == 0:
+            del entities[i]
+            continue
+
+        if e.offset + e.length > left_offset:
+            if e.offset >= left_offset:
+                #  0 1|2 3 4 5       |       0 1|2 3 4 5
+                #     ^     ^        |          ^
+                #   lo(2)  o(5)      |      o(2)/lo(2)
+                e.offset -= left_offset
+                #     |0 1 2 3       |          |0 1 2 3
+                #           ^        |          ^
+                #     o=o-lo(3=5-2)  |    o=o-lo(0=2-2)
            else:
-                e.length -= 1
-        text = text[:-1]
+                # e.offset < left_offset and e.offset + e.length > left_offset
+                #  0 1 2 3|4 5 6 7 8 9 10
+                #   ^     ^           ^
+                #  o(1) lo(4)      o+l(1+9)
+                e.length = e.offset + e.length - left_offset
+                e.offset = 0
+                #         |0 1 2 3 4 5 6
+                #         ^           ^
+                #        o(0)  o+l=0+o+l-lo(6=0+6=0+1+9-4)
+        else:
+            # e.offset + e.length <= left_offset
+            #   0 1 2 3|4 5
+            #  ^       ^
+            # o(0)   o+l(4)
+            #        lo(4)
+            del entities[i]
+            continue

-    while text and text[0].isspace():
-        for i in reversed(range(len(entities))):
-            e = entities[i]
-            if e.offset != 0:
-                e.offset -= 1
-                continue
-
-            if e.length == 1:
-                del entities[0]
-                if not entities:
-                    return text.lstrip()
-            else:
-                e.length -= 1
-
-        text = text[1:]
+        if e.offset + e.length <= len_final:
+            # |0 1 2 3 4 5 6 7 8 9
+            #   ^                 ^
+            #  o(1)       o+l(1+9)/lf(10)
+            continue
+        if e.offset >= len_final:
+            # |0 1 2 3 4
+            #           ^
+            #       o(5)/lf(5)
+            del entities[i]
+        else:
+            # e.offset < len_final and e.offset + e.length > len_final
+            # |0 1 2 3 4 5 (6) (7) (8) (9)
+            #   ^         ^           ^
+            #  o(1)     lf(6)      o+l(1+8)
+            e.length = len_final - e.offset
+            # |0 1 2 3 4 5
+            #   ^         ^
+            #  o(1) o+l=o+lf-o=lf(6=1+5=1+6-1)

    return text

--- a/tests/telethon/test_helpers.py
+++ b/tests/telethon/test_helpers.py
@@ -7,11 +7,34 @@ from base64 import b64decode
 import pytest

 from telethon import helpers
+from telethon.utils import get_inner_text
+from telethon.tl.types import MessageEntityUnknown as Meu


 def test_strip_text():
-    assert helpers.strip_text(" text ", []) == "text"
-    # I can't interpret the rest of the code well enough yet
+    text = ' text '
+    text_stripped = 'text'
+    entities_before_and_after = (
+        ([], []),
+        ([Meu(i, 0) for i in range(10)], []),  # del ''
+        ([Meu(0, 0), Meu(0, 1), Meu(5, 1)], []),  # del '', ' ', ' '
+        ([Meu(0, 3)], [Meu(0, 2)]),  # ' te' -> 'te'
+        ([Meu(3, 1)], [Meu(2, 1)]),  # 'x'
+        ([Meu(3, 2)], [Meu(2, 2)]),  # 'xt'
+        ([Meu(3, 3)], [Meu(2, 2)]),  # 'xt ' -> 'xt'
+        ([Meu(0, 6)], [Meu(0, 4)]),  # ' text ' -> 'text'
+    )
+    for entities_before, entities_expected in entities_before_and_after:
+        entities_for_test = [Meu(meu.offset, meu.length) for meu in entities_before]  # deep copy
+        text_after = helpers.strip_text(text, entities_for_test)
+        assert text_after == text_stripped
+        assert sorted((e.offset, e.length) for e in entities_for_test) \
+               == sorted((e.offset, e.length) for e in entities_expected)
+        inner_text_before = get_inner_text(text, entities_before)
+        inner_text_before_stripped = [t.strip() for t in inner_text_before]
+        inner_text_after = get_inner_text(text_after, entities_for_test)
+        for t in inner_text_after:
+            assert t in inner_text_before_stripped


 class TestSyncifyAsyncContext: