mirror of
https://github.com/LonamiWebs/Telethon.git
synced 2024-11-22 09:26:37 +03:00
Fix within surrogate detection
This commit is contained in:
parent
3a6c955c90
commit
3d32e16235
|
@ -174,12 +174,10 @@ def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0,
|
|||
# Otherwise we would end up with malformed text and fail to encode.
|
||||
# For example of bad input: "Hi \ud83d\ude1c"
|
||||
# https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
|
||||
while (relative_offset < _length
|
||||
and '\ud800' <= text[relative_offset] <= '\udfff'):
|
||||
while helpers.within_surrogate(text, relative_offset, length=_length):
|
||||
relative_offset += 1
|
||||
|
||||
while (relative_offset + length < _length
|
||||
and '\ud800' <= text[relative_offset + length] <= '\udfff'):
|
||||
while helpers.within_surrogate(text, relative_offset + length, length=_length):
|
||||
length += 1
|
||||
|
||||
entity_text = unparse(text=text[relative_offset:relative_offset + length],
|
||||
|
@ -224,7 +222,7 @@ def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0,
|
|||
skip_entity = True
|
||||
last_offset = relative_offset + (0 if skip_entity else length)
|
||||
|
||||
while last_offset < _length and '\ud800' <= text[last_offset] <= '\udfff':
|
||||
while helpers.within_surrogate(text, last_offset, length=_length):
|
||||
last_offset += 1
|
||||
|
||||
html.append(escape(text[last_offset:]))
|
||||
|
|
|
@ -6,7 +6,7 @@ since they seem to count as two characters and it's a bit strange.
|
|||
import re
|
||||
import warnings
|
||||
|
||||
from ..helpers import add_surrogate, del_surrogate, strip_text
|
||||
from ..helpers import add_surrogate, del_surrogate, within_surrogate, strip_text
|
||||
from ..tl import TLObject
|
||||
from ..tl.types import (
|
||||
MessageEntityBold, MessageEntityItalic, MessageEntityCode,
|
||||
|
@ -185,11 +185,11 @@ def unparse(text, entities, delimiters=None, url_fmt=None):
|
|||
while insert_at:
|
||||
at, what = insert_at.pop()
|
||||
|
||||
# If we are in the middle of a surrogate nudge the position by +1.
|
||||
# If we are in the middle of a surrogate nudge the position by +1.
|
||||
# Otherwise we would end up with malformed text and fail to encode.
|
||||
# For example of bad input: "Hi \ud83d\ude1c"
|
||||
# https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
|
||||
while at < len(text) and '\ud800' <= text[at] <= '\udfff':
|
||||
while within_surrogate(text, at):
|
||||
at += 1
|
||||
|
||||
text = text[:at] + what + text[at:]
|
||||
|
|
|
@ -40,6 +40,20 @@ def del_surrogate(text):
|
|||
return text.encode('utf-16', 'surrogatepass').decode('utf-16')
|
||||
|
||||
|
||||
def within_surrogate(text, index, *, length=None):
    """
    `True` if ``index`` is within a surrogate (before and after it, not at!).

    ``text`` is expected to hold UTF-16 code units as individual characters,
    so that characters outside the BMP appear as a high (lead) surrogate
    immediately followed by a low (trail) surrogate. ``index`` is "within"
    a surrogate pair only when it points at the low half, that is, when
    ``text[index - 1]`` is a high surrogate and ``text[index]`` is a low one.
    Pointing at the start of a pair, or right after it, yields `False`.

    ``length`` optionally limits how much of ``text`` is considered; it
    defaults to ``len(text)`` and is clamped to it so that an over-sized
    value can never cause an out-of-range access.
    """
    if length is None:
        length = len(text)
    else:
        # Callers may pass a logical length larger than the actual text.
        length = min(length, len(text))

    return (
        # In bounds; index 0 has no previous unit so it can never be within.
        0 < index < length and
        # Previous unit is a high (lead) surrogate.
        '\ud800' <= text[index - 1] <= '\udbff' and
        # Current unit is a low (trail) surrogate.
        '\udc00' <= text[index] <= '\udfff'
    )
|
||||
|
||||
|
||||
def strip_text(text, entities):
|
||||
"""
|
||||
Strips whitespace from the given text modifying the provided entities.
|
||||
|
|
|
@ -23,7 +23,7 @@ def test_malformed_entities():
|
|||
text = '🏆Telegram Official Android Challenge is over🏆.'
|
||||
entities = [MessageEntityTextUrl(offset=2, length=43, url='https://example.com')]
|
||||
result = html.unparse(text, entities)
|
||||
assert result == '🏆<a href="https://example.com">Telegram Official Android Challenge is over🏆</a>.'
|
||||
assert result == '🏆<a href="https://example.com">Telegram Official Android Challenge is over</a>🏆.'
|
||||
|
||||
|
||||
def test_trailing_malformed_entities():
|
||||
|
@ -35,7 +35,7 @@ def test_trailing_malformed_entities():
|
|||
text = '🏆Telegram Official Android Challenge is over🏆'
|
||||
entities = [MessageEntityTextUrl(offset=2, length=43, url='https://example.com')]
|
||||
result = html.unparse(text, entities)
|
||||
assert result == '🏆<a href="https://example.com">Telegram Official Android Challenge is over🏆</a>'
|
||||
assert result == '🏆<a href="https://example.com">Telegram Official Android Challenge is over</a>🏆'
|
||||
|
||||
|
||||
def test_entities_together():
|
||||
|
@ -51,3 +51,15 @@ def test_entities_together():
|
|||
|
||||
text = html.unparse(text, entities)
|
||||
assert text == original
|
||||
|
||||
|
||||
def test_offset_at_emoji():
    """
    Checks that an entity whose offset lands on an emoji keeps the emoji.
    """
    plain = 'Hi\n👉 See example'
    expected_entities = [
        MessageEntityBold(0, 2),
        MessageEntityItalic(3, 2),
        MessageEntityBold(10, 7),
    ]
    markup = '<strong>Hi</strong>\n<em>👉</em> See <strong>example</strong>'

    # Round trip: markup -> (text, entities) and back again.
    assert html.parse(markup) == (plain, expected_entities)
    assert html.unparse(plain, expected_entities) == markup
|
||||
|
|
|
@ -23,7 +23,7 @@ def test_malformed_entities():
|
|||
text = '🏆Telegram Official Android Challenge is over🏆.'
|
||||
entities = [MessageEntityTextUrl(offset=2, length=43, url='https://example.com')]
|
||||
result = markdown.unparse(text, entities)
|
||||
assert result == "🏆[Telegram Official Android Challenge is over🏆](https://example.com)."
|
||||
assert result == "🏆[Telegram Official Android Challenge is over](https://example.com)🏆."
|
||||
|
||||
|
||||
def test_trailing_malformed_entities():
|
||||
|
@ -35,7 +35,7 @@ def test_trailing_malformed_entities():
|
|||
text = '🏆Telegram Official Android Challenge is over🏆'
|
||||
entities = [MessageEntityTextUrl(offset=2, length=43, url='https://example.com')]
|
||||
result = markdown.unparse(text, entities)
|
||||
assert result == "🏆[Telegram Official Android Challenge is over🏆](https://example.com)"
|
||||
assert result == "🏆[Telegram Official Android Challenge is over](https://example.com)🏆"
|
||||
|
||||
|
||||
def test_entities_together():
|
||||
|
@ -51,3 +51,15 @@ def test_entities_together():
|
|||
|
||||
text = markdown.unparse(text, entities)
|
||||
assert text == original
|
||||
|
||||
|
||||
def test_offset_at_emoji():
    """
    Checks that an entity whose offset lands on an emoji keeps the emoji.
    """
    plain = 'Hi\n👉 See example'
    expected_entities = [
        MessageEntityBold(0, 2),
        MessageEntityItalic(3, 2),
        MessageEntityBold(10, 7),
    ]
    markup = '**Hi**\n__👉__ See **example**'

    # Round trip: markup -> (text, entities) and back again.
    assert markdown.parse(markup) == (plain, expected_entities)
    assert markdown.unparse(plain, expected_entities) == markup
|
||||
|
|
Loading…
Reference in New Issue
Block a user