mirror of
https://github.com/LonamiWebs/Telethon.git
synced 2024-11-22 01:16:35 +03:00
Fix HTML/MD parser producing empty MessageEntity (#3885)
Closes #3884. The implementation is also simplified.
This commit is contained in:
parent
066820900d
commit
046e2cb605
|
@ -64,40 +64,74 @@ def within_surrogate(text, index, *, length=None):
|
||||||
|
|
||||||
def strip_text(text, entities):
|
def strip_text(text, entities):
|
||||||
"""
|
"""
|
||||||
Strips whitespace from the given text modifying the provided entities.
|
Strips whitespace from the given surrogated text, modifying the provided
|
||||||
|
entities, also removing any empty (0-length) entities.
|
||||||
|
|
||||||
This assumes that there are no overlapping entities, that their length
|
This assumes that the length of entities is greater than or equal to 0, and
|
||||||
is greater or equal to one, and that their length is not out of bounds.
|
that no entity is out of bounds.
|
||||||
"""
|
"""
|
||||||
if not entities:
|
if not entities:
|
||||||
return text.strip()
|
return text.strip()
|
||||||
|
|
||||||
while text and text[-1].isspace():
|
len_ori = len(text)
|
||||||
e = entities[-1]
|
text = text.lstrip()
|
||||||
if e.offset + e.length == len(text):
|
left_offset = len_ori - len(text)
|
||||||
if e.length == 1:
|
text = text.rstrip()
|
||||||
del entities[-1]
|
len_final = len(text)
|
||||||
if not entities:
|
|
||||||
return text.strip()
|
for i in reversed(range(len(entities))):
|
||||||
|
e = entities[i]
|
||||||
|
if e.length == 0:
|
||||||
|
del entities[i]
|
||||||
|
continue
|
||||||
|
|
||||||
|
if e.offset + e.length > left_offset:
|
||||||
|
if e.offset >= left_offset:
|
||||||
|
# 0 1|2 3 4 5 | 0 1|2 3 4 5
|
||||||
|
# ^ ^ | ^
|
||||||
|
# lo(2) o(5) | o(2)/lo(2)
|
||||||
|
e.offset -= left_offset
|
||||||
|
# |0 1 2 3 | |0 1 2 3
|
||||||
|
# ^ | ^
|
||||||
|
# o=o-lo(3=5-2) | o=o-lo(0=2-2)
|
||||||
else:
|
else:
|
||||||
e.length -= 1
|
# e.offset < left_offset and e.offset + e.length > left_offset
|
||||||
text = text[:-1]
|
# 0 1 2 3|4 5 6 7 8 9 10
|
||||||
|
# ^ ^ ^
|
||||||
|
# o(1) lo(4) o+l(1+9)
|
||||||
|
e.length = e.offset + e.length - left_offset
|
||||||
|
e.offset = 0
|
||||||
|
# |0 1 2 3 4 5 6
|
||||||
|
# ^ ^
|
||||||
|
# o(0) o+l=0+o+l-lo(6=0+6=0+1+9-4)
|
||||||
|
else:
|
||||||
|
# e.offset + e.length <= left_offset
|
||||||
|
# 0 1 2 3|4 5
|
||||||
|
# ^ ^
|
||||||
|
# o(0) o+l(4)
|
||||||
|
# lo(4)
|
||||||
|
del entities[i]
|
||||||
|
continue
|
||||||
|
|
||||||
while text and text[0].isspace():
|
if e.offset + e.length <= len_final:
|
||||||
for i in reversed(range(len(entities))):
|
# |0 1 2 3 4 5 6 7 8 9
|
||||||
e = entities[i]
|
# ^ ^
|
||||||
if e.offset != 0:
|
# o(1) o+l(1+9)/lf(10)
|
||||||
e.offset -= 1
|
continue
|
||||||
continue
|
if e.offset >= len_final:
|
||||||
|
# |0 1 2 3 4
|
||||||
if e.length == 1:
|
# ^
|
||||||
del entities[0]
|
# o(5)/lf(5)
|
||||||
if not entities:
|
del entities[i]
|
||||||
return text.lstrip()
|
else:
|
||||||
else:
|
# e.offset < len_final and e.offset + e.length > len_final
|
||||||
e.length -= 1
|
# |0 1 2 3 4 5 (6) (7) (8) (9)
|
||||||
|
# ^ ^ ^
|
||||||
text = text[1:]
|
# o(1) lf(6) o+l(1+8)
|
||||||
|
e.length = len_final - e.offset
|
||||||
|
# |0 1 2 3 4 5
|
||||||
|
# ^ ^
|
||||||
|
# o(1) o+l=o+lf-o=lf(6=1+5=1+6-1)
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
|
@ -7,11 +7,34 @@ from base64 import b64decode
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from telethon import helpers
|
from telethon import helpers
|
||||||
|
from telethon.utils import get_inner_text
|
||||||
|
from telethon.tl.types import MessageEntityUnknown as Meu
|
||||||
|
|
||||||
|
|
||||||
def test_strip_text():
|
def test_strip_text():
|
||||||
assert helpers.strip_text(" text ", []) == "text"
|
text = ' text '
|
||||||
# I can't interpret the rest of the code well enough yet
|
text_stripped = 'text'
|
||||||
|
entities_before_and_after = (
|
||||||
|
([], []),
|
||||||
|
([Meu(i, 0) for i in range(10)], []), # del ''
|
||||||
|
([Meu(0, 0), Meu(0, 1), Meu(5, 1)], []), # del '', ' ', ' '
|
||||||
|
([Meu(0, 3)], [Meu(0, 2)]), # ' te' -> 'te'
|
||||||
|
([Meu(3, 1)], [Meu(2, 1)]), # 'x'
|
||||||
|
([Meu(3, 2)], [Meu(2, 2)]), # 'xt'
|
||||||
|
([Meu(3, 3)], [Meu(2, 2)]), # 'xt ' -> 'xt'
|
||||||
|
([Meu(0, 6)], [Meu(0, 4)]), # ' text ' -> 'text'
|
||||||
|
)
|
||||||
|
for entities_before, entities_expected in entities_before_and_after:
|
||||||
|
entities_for_test = [Meu(meu.offset, meu.length) for meu in entities_before] # deep copy
|
||||||
|
text_after = helpers.strip_text(text, entities_for_test)
|
||||||
|
assert text_after == text_stripped
|
||||||
|
assert sorted((e.offset, e.length) for e in entities_for_test) \
|
||||||
|
== sorted((e.offset, e.length) for e in entities_expected)
|
||||||
|
inner_text_before = get_inner_text(text, entities_before)
|
||||||
|
inner_text_before_stripped = [t.strip() for t in inner_text_before]
|
||||||
|
inner_text_after = get_inner_text(text_after, entities_for_test)
|
||||||
|
for t in inner_text_after:
|
||||||
|
assert t in inner_text_before_stripped
|
||||||
|
|
||||||
|
|
||||||
class TestSyncifyAsyncContext:
|
class TestSyncifyAsyncContext:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user