Fix HTML/MD parser producing empty MessageEntity (#3885)

Closes #3884. The implementation is also simplified.
This commit is contained in:
Rongrong 2022-07-25 17:11:26 +08:00 committed by GitHub
parent 066820900d
commit 046e2cb605
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 86 additions and 29 deletions

View File

@@ -64,40 +64,74 @@ def within_surrogate(text, index, *, length=None):
def strip_text(text, entities):
    """
    Strips whitespace from the given surrogated text modifying the provided
    entities, also removing any empty (0-length) entities.

    This assumes that the length of entities is greater or equal to 0, and
    that no entity is out of bounds.

    :param text: the text to strip (leading and trailing whitespace removed).
    :param entities: a mutable list of objects with integer ``offset`` and
        ``length`` attributes, expressed in the same units as indices into
        ``text``. The list is edited in place: surviving entities keep their
        relative order and have ``offset``/``length`` rewritten; entities
        that end up covering nothing are removed from the list.
    :return: the stripped text.
    """
    if not entities:
        # Nothing to adjust (also covers ``None``), so only the text matters.
        return text.strip()

    len_ori = len(text)
    text = text.lstrip()
    # Number of leading characters removed; every offset shifts left by this.
    left_offset = len_ori - len(text)
    text = text.rstrip()
    # Length of the surviving text; nothing may extend past this.
    len_final = len(text)

    # The surviving text is the window [left_offset, left_offset + len_final)
    # of the original. Clamp each entity to that window and re-express it
    # relative to the window start.
    #
    #   original:  ' ' ' ' 'a' 'b' ' '      left_offset = 2, len_final = 2
    #   entity:     [0 ........ 4)       -> clamped to [2, 4) -> offset 0, length 2
    window_end = left_offset + len_final
    kept = []
    for e in entities:
        start = max(e.offset, left_offset) - left_offset
        end = min(e.offset + e.length, window_end) - left_offset
        if end <= start:
            # Either the entity was empty to begin with, or it only covered
            # whitespace that has been stripped away.
            continue
        e.offset = start
        e.length = end - start
        kept.append(e)

    # Slice-assign so callers holding a reference to the list see the result;
    # this also avoids repeated O(n) deletions from the middle of the list.
    entities[:] = kept
    return text

View File

@@ -7,11 +7,34 @@ from base64 import b64decode
import pytest
from telethon import helpers
from telethon.utils import get_inner_text
from telethon.tl.types import MessageEntityUnknown as Meu
def test_strip_text():
    """
    Checks that ``helpers.strip_text`` strips the text and adjusts, trims or
    removes the accompanying entities accordingly.
    """
    # Simplest case: no entities at all, only the text is stripped.
    assert helpers.strip_text(" text ", []) == "text"

    text = ' text '
    text_stripped = 'text'
    # Pairs of (entities passed in, entities expected afterwards); the
    # trailing comment on each row shows the text covered by the entities.
    entities_before_and_after = (
        ([], []),
        ([Meu(i, 0) for i in range(10)], []),  # del ''
        ([Meu(0, 0), Meu(0, 1), Meu(5, 1)], []),  # del '', ' ', ' '
        ([Meu(0, 3)], [Meu(0, 2)]),  # ' te' -> 'te'
        ([Meu(3, 1)], [Meu(2, 1)]),  # 'x'
        ([Meu(3, 2)], [Meu(2, 2)]),  # 'xt'
        ([Meu(3, 3)], [Meu(2, 2)]),  # 'xt ' -> 'xt'
        ([Meu(0, 6)], [Meu(0, 4)]),  # ' text ' -> 'text'
    )
    for entities_before, entities_expected in entities_before_and_after:
        # strip_text mutates both the list and the entities, so work on a
        # copy and keep ``entities_before`` intact for the comparison below.
        entities_for_test = [Meu(meu.offset, meu.length) for meu in entities_before]  # deep copy
        text_after = helpers.strip_text(text, entities_for_test)
        assert text_after == text_stripped
        assert sorted((e.offset, e.length) for e in entities_for_test) \
            == sorted((e.offset, e.length) for e in entities_expected)

        # Every surviving entity must cover the stripped form of some text
        # that an original entity covered.
        inner_text_before = get_inner_text(text, entities_before)
        inner_text_before_stripped = [t.strip() for t in inner_text_before]
        inner_text_after = get_inner_text(text_after, entities_for_test)
        for t in inner_text_after:
            assert t in inner_text_before_stripped
class TestSyncifyAsyncContext: