From 5f2f04c6c2676201cbb19e894d7d5bb3f3335ff0 Mon Sep 17 00:00:00 2001 From: Tulir Asokan Date: Mon, 22 Jan 2018 11:06:11 +0200 Subject: [PATCH] Add HTML parse mode (#554) --- telethon/extensions/html.py | 167 ++++++++++++++++++++++++++++++++++++ telethon/telegram_client.py | 4 +- 2 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 telethon/extensions/html.py diff --git a/telethon/extensions/html.py b/telethon/extensions/html.py new file mode 100644 index 00000000..8cd170cb --- /dev/null +++ b/telethon/extensions/html.py @@ -0,0 +1,167 @@ +""" +Simple HTML -> Telegram entity parser. +""" +from html import escape, unescape +from html.parser import HTMLParser +from collections import deque + +from ..tl.types import ( + MessageEntityBold, MessageEntityItalic, MessageEntityCode, + MessageEntityPre, MessageEntityEmail, MessageEntityUrl, + MessageEntityTextUrl +) + + +class HTMLToTelegramParser(HTMLParser): + def __init__(self): + super().__init__() + self.text = '' + self.entities = [] + self._building_entities = {} + self._open_tags = deque() + self._open_tags_meta = deque() + + def handle_starttag(self, tag, attrs): + self._open_tags.appendleft(tag) + self._open_tags_meta.appendleft(None) + + attrs = dict(attrs) + EntityType = None + args = {} + if tag == 'strong' or tag == 'b': + EntityType = MessageEntityBold + elif tag == 'em' or tag == 'i': + EntityType = MessageEntityItalic + elif tag == 'code': + try: + # If we're in the middle of a
+                # <pre> tag, this <code> tag is
+                # probably intended for syntax highlighting.
+                #
+                # Syntax highlighting is set with
+                #     <code class='language-...'>codeblock</code>
+                # inside <pre> tags
+                pre = self._building_entities['pre']
+                try:
+                    pre.language = attrs['class'][len('language-'):]
+                except KeyError:
+                    pass
+            except KeyError:
+                EntityType = MessageEntityCode
+        elif tag == 'pre':
+            EntityType = MessageEntityPre
+            args['language'] = ''
+        elif tag == 'a':
+            try:
+                url = attrs['href']
+            except KeyError:
+                return
+            if url.startswith('mailto:'):
+                url = url[len('mailto:'):]
+                EntityType = MessageEntityEmail
+            else:
+                if self.get_starttag_text() == url:
+                    EntityType = MessageEntityUrl
+                else:
+                    EntityType = MessageEntityTextUrl
+                    args['url'] = url
+                    url = None
+            self._open_tags_meta.popleft()
+            self._open_tags_meta.appendleft(url)
+
+        if EntityType and tag not in self._building_entities:
+            self._building_entities[tag] = EntityType(
+                offset=len(self.text),
+                # The length will be determined when closing the tag.
+                length=0,
+                **args)
+
+    def handle_data(self, text):
+        """
+        Appends a chunk of character data to the plain-text output and
+        grows every entity that is currently being built.
+
+        The parser is created with the ``HTMLParser`` defaults, so character
+        references in ``text`` have already been decoded by the time this
+        method is called. The text must not be unescaped a second time, or
+        input such as ``&amp;lt;`` would collapse to ``<`` instead of the
+        literal ``&lt;`` the author wrote.
+
+        :param text: the (already decoded) text found between two tags.
+        """
+        # Inside an <a> whose metadata slot holds a value (set by
+        # handle_starttag), that value replaces the visible link text.
+        if self._open_tags and self._open_tags[0] == 'a':
+            url = self._open_tags_meta[0]
+            if url:
+                text = url
+
+        # Every entity still open spans this chunk. Newlines at either end
+        # of the chunk are not counted towards the entity length.
+        grown = len(text.strip('\n'))
+        for entity in self._building_entities.values():
+            entity.length += grown
+
+        self.text += text
+
+    def handle_endtag(self, tag):
+        """
+        Closes ``tag``: drops the newest entry from the open-tag stacks and,
+        if an entity was being built for ``tag``, records it as finished.
+
+        :param tag: the name of the tag being closed.
+        """
+        # The tag and metadata stacks are popped together; nothing at all
+        # is popped when the tag stack is already empty.
+        if self._open_tags:
+            self._open_tags.popleft()
+            if self._open_tags_meta:
+                self._open_tags_meta.popleft()
+
+        finished = self._building_entities.pop(tag, None)
+        if finished:
+            self.entities.append(finished)
+
+
+def parse(html):
+    """
+    Parses the given HTML message and returns its stripped representation
+    plus a list of the MessageEntity's that were found.
+
+    :param html: the message with HTML to be parsed.
+    :return: a tuple consisting of (clean message, [message entities]).
+    """
+    parser = HTMLToTelegramParser()
+    parser.feed(html)
+    # feed() may keep trailing text buffered when it could still turn into
+    # a character reference (for example a message ending in "AT&T");
+    # close() forces that remaining data to be processed so it is not lost.
+    parser.close()
+    return parser.text, parser.entities
+
+
+def unparse(text, entities):
+    """
+    Performs the reverse operation to .parse(), effectively returning HTML
+    given a normal text and its MessageEntity's.
+
+    :param text: the text to be reconverted into HTML.
+    :param entities: the MessageEntity's applied to the text.
+    :return: a HTML representation of the combination of both inputs.
+    """
+    if not entities:
+        return text
+    html = []
+    last_offset = 0
+    for entity in entities:
+        if entity.offset > last_offset:
+            html.append(escape(text[last_offset:entity.offset]))
+        elif entity.offset < last_offset:
+            continue
+
+        skip_entity = False
+        entity_text = escape(text[entity.offset:entity.offset + entity.length])
+        entity_type = type(entity)
+
+        if entity_type == MessageEntityBold:
+            html.append('{}'.format(entity_text))
+        elif entity_type == MessageEntityItalic:
+            html.append('{}'.format(entity_text))
+        elif entity_type == MessageEntityCode:
+            html.append('{}'.format(entity_text))
+        elif entity_type == MessageEntityPre:
+            if entity.language:
+                html.append(
+                    "
\n"
+                    "    \n"
+                    "        {}\n"
+                    "    \n"
+                    "
".format(entity.language, entity_text)) + else: + html.append('
{}
' + .format(entity_text)) + elif entity_type == MessageEntityEmail: + html.append('{0}'.format(entity_text)) + elif entity_type == MessageEntityUrl: + html.append('{0}'.format(entity_text)) + elif entity_type == MessageEntityTextUrl: + html.append('{}' + .format(escape(entity.url), entity_text)) + else: + skip_entity = True + last_offset = entity.offset + (0 if skip_entity else entity.length) + html.append(text[last_offset:]) + return ''.join(html) diff --git a/telethon/telegram_client.py b/telethon/telegram_client.py index 5fe186f3..67644a7e 100644 --- a/telethon/telegram_client.py +++ b/telethon/telegram_client.py @@ -70,7 +70,7 @@ from .tl.types import ( InputDocument, InputMediaDocument ) from .tl.types.messages import DialogsSlice -from .extensions import markdown +from .extensions import markdown, html __log__ = logging.getLogger(__name__) @@ -580,6 +580,8 @@ class TelegramClient(TelegramBareClient): parse_mode = parse_mode.lower() if parse_mode in {'md', 'markdown'}: message, msg_entities = markdown.parse(message) + elif parse_mode.startswith('htm'): + message, msg_entities = html.parse(message) else: raise ValueError('Unknown parsing mode: {}'.format(parse_mode)) else: