diff --git a/telethon/extensions/html.py b/telethon/extensions/html.py index a25ed58b..564dcf13 100644 --- a/telethon/extensions/html.py +++ b/telethon/extensions/html.py @@ -7,7 +7,8 @@ from html import escape from html.parser import HTMLParser from typing import Iterable, Optional, Tuple, List -from .. import helpers +from ..helpers import add_surrogate, del_surrogate, within_surrogate, strip_text +from ..tl import TLObject from ..tl.types import ( MessageEntityBold, MessageEntityItalic, MessageEntityCode, MessageEntityPre, MessageEntityEmail, MessageEntityUrl, @@ -17,18 +18,6 @@ from ..tl.types import ( ) -# Helpers from markdown.py -def _add_surrogate(text): - return ''.join( - ''.join(chr(y) for y in struct.unpack(' Tuple[str, List[TypeMessageEntity]]: return html, [] parser = HTMLToTelegramParser() - parser.feed(_add_surrogate(html)) - text = helpers.strip_text(parser.text, parser.entities) - return _del_surrogate(text), parser.entities + parser.feed(add_surrogate(html)) + text = strip_text(parser.text, parser.entities) + return del_surrogate(text), parser.entities -def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0, - _length: Optional[int] = None) -> str: +ENTITY_TO_FORMATTER = { + MessageEntityBold: ('<strong>', '</strong>'), + MessageEntityItalic: ('<em>', '</em>'), + MessageEntityCode: ('<code>', '</code>'), + MessageEntityUnderline: ('<u>', '</u>'), + MessageEntityStrike: ('<del>', '</del>'), + MessageEntityBlockquote: ('
<blockquote>', '</blockquote>'), + MessageEntityPre: lambda e, _: ( + "<pre>\n"
+        "    <code class='language-{}'>\n"
+        "        ".format(e.language), "{}\n"
+        "    </code>\n"
+        "</pre>
" + ), + MessageEntityEmail: lambda _, t: ('<a href="mailto:{}">'.format(t), '</a>'), + MessageEntityUrl: lambda _, t: ('<a href="{}">'.format(t), '</a>'), + MessageEntityTextUrl: lambda e, _: ('<a href="{}">'.format(escape(e.url)), '</a>'), + MessageEntityMentionName: lambda e, _: ('<a href="tg://user?id={}">'.format(e.user_id), '</a>'), +} + + +def unparse(text: str, entities: Iterable[TypeMessageEntity]) -> str: """ Performs the reverse operation to .parse(), effectively returning HTML given a normal text and its MessageEntity's. @@ -153,77 +162,32 @@ def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0, elif not entities: return escape(text) - text = _add_surrogate(text) - if _length is None: - _length = len(text) - html = [] - last_offset = 0 - for i, entity in enumerate(entities): - if entity.offset >= _offset + _length: - break - relative_offset = entity.offset - _offset - if relative_offset > last_offset: - html.append(escape(text[last_offset:relative_offset])) - elif relative_offset < last_offset: - continue + if isinstance(entities, TLObject): + entities = (entities,) - skip_entity = False - length = entity.length + text = add_surrogate(text) + insert_at = [] + for entity in entities: + s = entity.offset + e = entity.offset + entity.length + delimiter = ENTITY_TO_FORMATTER.get(type(entity), None) + if delimiter: + if callable(delimiter): + delimiter = delimiter(entity, text[s:e]) + insert_at.append((s, delimiter[0])) + insert_at.append((e, delimiter[1])) - # If we are in the middle of a surrogate nudge the position by +1. - # Otherwise we would end up with malformed text and fail to encode. 
- # For example of bad input: "Hi \ud83d\ude1c" - # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF - while helpers.within_surrogate(text, relative_offset, length=_length): - relative_offset += 1 + insert_at.sort(key=lambda t: t[0]) + next_escape_bound = len(text) + while insert_at: + # Same logic as markdown.py + at, what = insert_at.pop() + while within_surrogate(text, at): + at += 1 - while helpers.within_surrogate(text, relative_offset + length, length=_length): - length += 1 + text = text[:at] + what + escape(text[at:next_escape_bound]) + text[next_escape_bound:] + next_escape_bound = at - entity_text = unparse(text=text[relative_offset:relative_offset + length], - entities=entities[i + 1:], - _offset=entity.offset, _length=length) - entity_type = type(entity) + text = escape(text[:next_escape_bound]) + text[next_escape_bound:] - if entity_type == MessageEntityBold: - html.append('<strong>{}</strong>'.format(entity_text)) - elif entity_type == MessageEntityItalic: - html.append('<em>{}</em>'.format(entity_text)) - elif entity_type == MessageEntityCode: - html.append('<code>{}</code>'.format(entity_text)) - elif entity_type == MessageEntityUnderline: - html.append('<u>{}</u>'.format(entity_text)) - elif entity_type == MessageEntityStrike: - html.append('<del>{}</del>'.format(entity_text)) - elif entity_type == MessageEntityBlockquote: - html.append('
<blockquote>{}</blockquote>'.format(entity_text)) - elif entity_type == MessageEntityPre: - if entity.language: - html.append( - "<pre>\n"
-                    "    <code class='language-{}'>\n"
-                    "        {}\n"
-                    "    </code>\n"
-                    "</pre>".format(entity.language, entity_text)) - else: - html.append('<pre>{}</pre>
' - .format(entity_text)) - elif entity_type == MessageEntityEmail: - html.append('<a href="mailto:{0}">{0}</a>'.format(entity_text)) - elif entity_type == MessageEntityUrl: - html.append('<a href="{0}">{0}</a>'.format(entity_text)) - elif entity_type == MessageEntityTextUrl: - html.append('<a href="{}">{}</a>' - .format(escape(entity.url), entity_text)) - elif entity_type == MessageEntityMentionName: - html.append('<a href="tg://user?id={}">{}</a>' - .format(entity.user_id, entity_text)) - else: - skip_entity = True - last_offset = relative_offset + (0 if skip_entity else length) - - while helpers.within_surrogate(text, last_offset, length=_length): - last_offset += 1 - - html.append(escape(text[last_offset:])) - return _del_surrogate(''.join(html)) + return del_surrogate(text)