mirror of
https://github.com/LonamiWebs/Telethon.git
synced 2025-06-24 23:43:10 +03:00
Fix unparsing text with malformed message entities
This commit is contained in:
parent
ccbc1c669c
commit
f3111f93b2
|
@ -168,9 +168,21 @@ def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0,
|
||||||
continue
|
continue
|
||||||
|
|
||||||
skip_entity = False
|
skip_entity = False
|
||||||
entity_text = unparse(text=text[relative_offset:relative_offset + entity.length],
|
length = entity.length
|
||||||
|
|
||||||
|
# If we are in the middle of a surrogate nudge the position by +1.
|
||||||
|
# Otherwise we would end up with malformed text and fail to encode.
|
||||||
|
# For example of bad input: "Hi \ud83d\ude1c"
|
||||||
|
# https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
|
||||||
|
if '\ud800' <= text[relative_offset] <= '\udfff':
|
||||||
|
relative_offset += 1
|
||||||
|
|
||||||
|
if '\ud800' <= text[relative_offset + length] <= '\udfff':
|
||||||
|
length += 1
|
||||||
|
|
||||||
|
entity_text = unparse(text=text[relative_offset:relative_offset + length],
|
||||||
entities=entities[i + 1:],
|
entities=entities[i + 1:],
|
||||||
_offset=entity.offset, _length=entity.length)
|
_offset=entity.offset, _length=length)
|
||||||
entity_type = type(entity)
|
entity_type = type(entity)
|
||||||
|
|
||||||
if entity_type == MessageEntityBold:
|
if entity_type == MessageEntityBold:
|
||||||
|
@ -208,6 +220,10 @@ def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0,
|
||||||
.format(entity.user_id, entity_text))
|
.format(entity.user_id, entity_text))
|
||||||
else:
|
else:
|
||||||
skip_entity = True
|
skip_entity = True
|
||||||
last_offset = relative_offset + (0 if skip_entity else entity.length)
|
last_offset = relative_offset + (0 if skip_entity else length)
|
||||||
|
|
||||||
|
if last_offset < len(text) and '\ud800' <= text[last_offset] <= '\udfff':
|
||||||
|
last_offset += 1
|
||||||
|
|
||||||
html.append(escape(text[last_offset:]))
|
html.append(escape(text[last_offset:]))
|
||||||
return _del_surrogate(''.join(html))
|
return _del_surrogate(''.join(html))
|
||||||
|
|
|
@ -184,6 +184,14 @@ def unparse(text, entities, delimiters=None, url_fmt=None):
|
||||||
insert_at.sort(key=lambda t: t[0])
|
insert_at.sort(key=lambda t: t[0])
|
||||||
while insert_at:
|
while insert_at:
|
||||||
at, what = insert_at.pop()
|
at, what = insert_at.pop()
|
||||||
|
|
||||||
|
# If we are in the middle of a surrogate nudge the position by +1.
|
||||||
|
# Otherwise we would end up with malformed text and fail to encode.
|
||||||
|
# For example of bad input: "Hi \ud83d\ude1c"
|
||||||
|
# https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
|
||||||
|
if '\ud800' <= text[at] <= '\udfff':
|
||||||
|
at += 1
|
||||||
|
|
||||||
text = text[:at] + what + text[at:]
|
text = text[:at] + what + text[at:]
|
||||||
|
|
||||||
return del_surrogate(text)
|
return del_surrogate(text)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user