Fix HTML entity parsing failing when the text contains characters that need UTF-16 surrogate pairs

This commit is contained in:
Lonami Exo 2018-02-15 11:52:46 +01:00
parent 178643d3a1
commit 75d99fbb53
2 changed files with 22 additions and 4 deletions

View File

@ -1,9 +1,10 @@
""" """
Simple HTML -> Telegram entity parser. Simple HTML -> Telegram entity parser.
""" """
import struct
from collections import deque
from html import escape, unescape from html import escape, unescape
from html.parser import HTMLParser from html.parser import HTMLParser
from collections import deque
from ..tl.types import ( from ..tl.types import (
MessageEntityBold, MessageEntityItalic, MessageEntityCode, MessageEntityBold, MessageEntityItalic, MessageEntityCode,
@ -12,6 +13,18 @@ from ..tl.types import (
) )
# Helpers from markdown.py
def _add_surrogate(text):
return ''.join(
''.join(chr(y) for y in struct.unpack('<HH', x.encode('utf-16le')))
if (0x10000 <= ord(x) <= 0x10FFFF) else x for x in text
)
def _del_surrogate(text):
return text.encode('utf-16', 'surrogatepass').decode('utf-16')
class HTMLToTelegramParser(HTMLParser): class HTMLToTelegramParser(HTMLParser):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
@ -109,8 +122,8 @@ def parse(html):
:return: a tuple consisting of (clean message, [message entities]). :return: a tuple consisting of (clean message, [message entities]).
""" """
parser = HTMLToTelegramParser() parser = HTMLToTelegramParser()
parser.feed(html) parser.feed(_add_surrogate(html))
return parser.text, parser.entities return _del_surrogate(parser.text), parser.entities
def unparse(text, entities): def unparse(text, entities):
@ -124,6 +137,8 @@ def unparse(text, entities):
""" """
if not entities: if not entities:
return text return text
text = _add_surrogate(text)
html = [] html = []
last_offset = 0 last_offset = 0
for entity in entities: for entity in entities:
@ -164,4 +179,4 @@ def unparse(text, entities):
skip_entity = True skip_entity = True
last_offset = entity.offset + (0 if skip_entity else entity.length) last_offset = entity.offset + (0 if skip_entity else entity.length)
html.append(text[last_offset:]) html.append(text[last_offset:])
return ''.join(html) return _del_surrogate(''.join(html))

View File

@ -152,6 +152,9 @@ def unparse(text, entities, delimiters=None, url_fmt=None):
:param entities: the MessageEntity's applied to the text. :param entities: the MessageEntity's applied to the text.
:return: a markdown-like text representing the combination of both inputs. :return: a markdown-like text representing the combination of both inputs.
""" """
if not entities:
return text
if not delimiters: if not delimiters:
if delimiters is not None: if delimiters is not None:
return text return text