Fix HTML/MD parser producing empty MessageEntity (#3885)

Closes #3884. The implementation is also simplified.
This commit is contained in:
Rongrong 2022-07-25 17:11:26 +08:00 committed by GitHub
parent 066820900d
commit 046e2cb605
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 86 additions and 29 deletions

View File

@ -64,40 +64,74 @@ def within_surrogate(text, index, *, length=None):
def strip_text(text, entities):
    """
    Strips whitespace from the given surrogated text modifying the provided
    entities, also removing any empty (0-length) entities.

    This assumes that the length of entities is greater or equal to 0, and
    that no entity is out of bounds.

    The ``entities`` list is edited in place: entities that end up covering
    only stripped whitespace (or that were empty to begin with) are deleted
    from it, and the ``offset`` / ``length`` attributes of the remaining ones
    are rewritten so they refer to positions in the returned text.

    Returns the stripped text.
    """
    if not entities:
        return text.strip()

    # Strip each side separately so we know how many leading characters
    # were dropped (every offset has to shift left by that amount) and
    # how long the final text is (nothing may extend past it).
    len_ori = len(text)
    text = text.lstrip()
    left_offset = len_ori - len(text)
    text = text.rstrip()
    len_final = len(text)

    # Walk backwards so that deleting entities[i] never disturbs the
    # indices that are still pending.
    for i in reversed(range(len(entities))):
        e = entities[i]
        if e.length == 0:
            # Empty entities carry no text at all; drop them outright.
            del entities[i]
            continue

        # --- Account for the whitespace removed on the left ---
        if e.offset + e.length > left_offset:
            if e.offset >= left_offset:
                # Entity starts at or after the first kept character:
                # same length, just shifted left.
                # e.g. left_offset=2, offset=5 -> offset=3
                e.offset -= left_offset
            else:
                # Entity starts inside the stripped prefix but reaches
                # into the kept text: keep only the surviving part,
                # which now begins at position 0.
                # e.g. left_offset=4, offset=1, length=9 -> offset=0, length=6
                e.length = e.offset + e.length - left_offset
                e.offset = 0
        else:
            # Entity ends at or before left_offset, so it lies entirely
            # within the stripped leading whitespace.
            del entities[i]
            continue

        # --- Account for the whitespace removed on the right ---
        # (e.offset / e.length are already relative to the stripped text.)
        if e.offset + e.length <= len_final:
            # Entity ends within the final text; nothing else to do.
            continue
        if e.offset >= len_final:
            # Entity starts at or after the end of the final text, so it
            # lies entirely within the stripped trailing whitespace.
            del entities[i]
        else:
            # Entity starts inside the final text but runs past its end:
            # truncate it so that it ends exactly at len_final.
            # e.g. len_final=6, offset=1, length=8 -> length=5
            e.length = len_final - e.offset

    return text

View File

@ -7,11 +7,34 @@ from base64 import b64decode
import pytest import pytest
from telethon import helpers from telethon import helpers
from telethon.utils import get_inner_text
from telethon.tl.types import MessageEntityUnknown as Meu
def test_strip_text():
    """
    Checks that ``helpers.strip_text`` strips the text and adjusts, trims
    or removes the given entities accordingly.
    """
    # One whitespace character on each side of a 4-character word.
    text = ' text '
    text_stripped = 'text'
    # Pairs of (entities passed in, entities expected afterwards).
    entities_before_and_after = (
        ([], []),
        ([Meu(i, 0) for i in range(10)], []),  # del ''
        ([Meu(0, 0), Meu(0, 1), Meu(5, 1)], []),  # del '', ' ', ' '
        ([Meu(0, 3)], [Meu(0, 2)]),  # ' te' -> 'te'
        ([Meu(3, 1)], [Meu(2, 1)]),  # 'x'
        ([Meu(3, 2)], [Meu(2, 2)]),  # 'xt'
        ([Meu(3, 3)], [Meu(2, 2)]),  # 'xt ' -> 'xt'
        ([Meu(0, 6)], [Meu(0, 4)]),  # ' text ' -> 'text'
    )
    for entities_before, entities_expected in entities_before_and_after:
        # strip_text mutates both the list and the entities inside it, so
        # hand it fresh copies and keep the originals for comparison.
        entities_for_test = [Meu(meu.offset, meu.length) for meu in entities_before]  # deep copy
        text_after = helpers.strip_text(text, entities_for_test)
        assert text_after == text_stripped
        # Compare as sorted (offset, length) pairs so the check does not
        # depend on entity order.
        assert sorted((e.offset, e.length) for e in entities_for_test) \
            == sorted((e.offset, e.length) for e in entities_expected)
        # The text covered by every surviving entity must be the stripped
        # form of the text covered by one of the original entities.
        inner_text_before = get_inner_text(text, entities_before)
        inner_text_before_stripped = [t.strip() for t in inner_text_before]
        inner_text_after = get_inner_text(text_after, entities_for_test)
        for t in inner_text_after:
            assert t in inner_text_before_stripped
class TestSyncifyAsyncContext: class TestSyncifyAsyncContext: