mirror of
https://github.com/LonamiWebs/Telethon.git
synced 2024-11-22 17:36:34 +03:00
Fix HTML entity parsing failing when needing surrogates
This commit is contained in:
parent
178643d3a1
commit
75d99fbb53
|
@@ -1,9 +1,10 @@
|
||||||
"""
|
"""
|
||||||
Simple HTML -> Telegram entity parser.
|
Simple HTML -> Telegram entity parser.
|
||||||
"""
|
"""
|
||||||
|
import struct
|
||||||
|
from collections import deque
|
||||||
from html import escape, unescape
|
from html import escape, unescape
|
||||||
from html.parser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
from collections import deque
|
|
||||||
|
|
||||||
from ..tl.types import (
|
from ..tl.types import (
|
||||||
MessageEntityBold, MessageEntityItalic, MessageEntityCode,
|
MessageEntityBold, MessageEntityItalic, MessageEntityCode,
|
||||||
|
@@ -12,6 +13,18 @@ from ..tl.types import (
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Helpers from markdown.py
|
||||||
|
def _add_surrogate(text):
|
||||||
|
return ''.join(
|
||||||
|
''.join(chr(y) for y in struct.unpack('<HH', x.encode('utf-16le')))
|
||||||
|
if (0x10000 <= ord(x) <= 0x10FFFF) else x for x in text
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _del_surrogate(text):
|
||||||
|
return text.encode('utf-16', 'surrogatepass').decode('utf-16')
|
||||||
|
|
||||||
|
|
||||||
class HTMLToTelegramParser(HTMLParser):
|
class HTMLToTelegramParser(HTMLParser):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
@@ -109,8 +122,8 @@ def parse(html):
|
||||||
:return: a tuple consisting of (clean message, [message entities]).
|
:return: a tuple consisting of (clean message, [message entities]).
|
||||||
"""
|
"""
|
||||||
parser = HTMLToTelegramParser()
|
parser = HTMLToTelegramParser()
|
||||||
parser.feed(html)
|
parser.feed(_add_surrogate(html))
|
||||||
return parser.text, parser.entities
|
return _del_surrogate(parser.text), parser.entities
|
||||||
|
|
||||||
|
|
||||||
def unparse(text, entities):
|
def unparse(text, entities):
|
||||||
|
@@ -124,6 +137,8 @@ def unparse(text, entities):
|
||||||
"""
|
"""
|
||||||
if not entities:
|
if not entities:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
text = _add_surrogate(text)
|
||||||
html = []
|
html = []
|
||||||
last_offset = 0
|
last_offset = 0
|
||||||
for entity in entities:
|
for entity in entities:
|
||||||
|
@@ -164,4 +179,4 @@ def unparse(text, entities):
|
||||||
skip_entity = True
|
skip_entity = True
|
||||||
last_offset = entity.offset + (0 if skip_entity else entity.length)
|
last_offset = entity.offset + (0 if skip_entity else entity.length)
|
||||||
html.append(text[last_offset:])
|
html.append(text[last_offset:])
|
||||||
return ''.join(html)
|
return _del_surrogate(''.join(html))
|
||||||
|
|
|
@@ -152,6 +152,9 @@ def unparse(text, entities, delimiters=None, url_fmt=None):
|
||||||
:param entities: the MessageEntity's applied to the text.
|
:param entities: the MessageEntity's applied to the text.
|
||||||
:return: a markdown-like text representing the combination of both inputs.
|
:return: a markdown-like text representing the combination of both inputs.
|
||||||
"""
|
"""
|
||||||
|
if not entities:
|
||||||
|
return text
|
||||||
|
|
||||||
if not delimiters:
|
if not delimiters:
|
||||||
if delimiters is not None:
|
if delimiters is not None:
|
||||||
return text
|
return text
|
||||||
|
|
Loading…
Reference in New Issue
Block a user