Telethon/telethon/extensions/html.py

"""
Simple HTML -> Telegram entity parser.
"""
import struct
from collections import deque
from html import escape
from html.parser import HTMLParser
from typing import Iterable, Optional, Tuple, List

from .. import helpers
from ..tl.types import (
    MessageEntityBold, MessageEntityItalic, MessageEntityCode,
    MessageEntityPre, MessageEntityEmail, MessageEntityUrl,
    MessageEntityTextUrl, MessageEntityMentionName,
    MessageEntityUnderline, MessageEntityStrike, MessageEntityBlockquote,
    TypeMessageEntity
)


# Helpers from markdown.py
def _add_surrogate(text):
    return ''.join(
        ''.join(chr(y) for y in struct.unpack('<HH', x.encode('utf-16le')))
        if (0x10000 <= ord(x) <= 0x10FFFF) else x for x in text
    )


def _del_surrogate(text):
    return text.encode('utf-16', 'surrogatepass').decode('utf-16')


class HTMLToTelegramParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.text = ''
        self.entities = []
        self._building_entities = {}
        self._open_tags = deque()
        self._open_tags_meta = deque()

    def handle_starttag(self, tag, attrs):
        self._open_tags.appendleft(tag)
        self._open_tags_meta.appendleft(None)

        attrs = dict(attrs)
        EntityType = None
        args = {}
        if tag == 'strong' or tag == 'b':
            EntityType = MessageEntityBold
        elif tag == 'em' or tag == 'i':
            EntityType = MessageEntityItalic
        elif tag == 'u':
            EntityType = MessageEntityUnderline
        elif tag == 'del' or tag == 's':
            EntityType = MessageEntityStrike
        elif tag == 'blockquote':
            EntityType = MessageEntityBlockquote
        elif tag == 'code':
            try:
                # If we're in the middle of a <pre> tag, this <code> tag is
                # probably intended for syntax highlighting.
                #
                # Syntax highlighting is set with
                #     <code class='language-...'>codeblock</code>
                # inside <pre> tags
                pre = self._building_entities['pre']
                try:
                    pre.language = attrs['class'][len('language-'):]
                except KeyError:
                    pass
            except KeyError:
                EntityType = MessageEntityCode
        elif tag == 'pre':
            EntityType = MessageEntityPre
            args['language'] = ''
        elif tag == 'a':
            try:
                url = attrs['href']
            except KeyError:
                return
            if url.startswith('mailto:'):
                url = url[len('mailto:'):]
                EntityType = MessageEntityEmail
            else:
                if self.get_starttag_text() == url:
                    EntityType = MessageEntityUrl
                else:
                    EntityType = MessageEntityTextUrl
                    args['url'] = url
                    url = None
            self._open_tags_meta.popleft()
            self._open_tags_meta.appendleft(url)

        if EntityType and tag not in self._building_entities:
            self._building_entities[tag] = EntityType(
                offset=len(self.text),
                # The length will be determined when closing the tag.
                length=0,
                **args)

    def handle_data(self, text):
        previous_tag = self._open_tags[0] if len(self._open_tags) > 0 else ''
        if previous_tag == 'a':
            url = self._open_tags_meta[0]
            if url:
                text = url

        for tag, entity in self._building_entities.items():
            entity.length += len(text)

        self.text += text

    def handle_endtag(self, tag):
        try:
            self._open_tags.popleft()
            self._open_tags_meta.popleft()
        except IndexError:
            pass
        entity = self._building_entities.pop(tag, None)
        if entity:
            self.entities.append(entity)


def parse(html: str) -> Tuple[str, List[TypeMessageEntity]]:
    """
    Parses the given HTML message and returns its stripped representation
    plus a list of the MessageEntity's that were found.

    :param html: the message with HTML to be parsed.
    :return: a tuple consisting of (clean message, [message entities]).
    """
    if not html:
        return html, []

    parser = HTMLToTelegramParser()
    parser.feed(_add_surrogate(html))
    text = helpers.strip_text(parser.text, parser.entities)
    return _del_surrogate(text), parser.entities


def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0,
            _length: Optional[int] = None) -> str:
    """
    Performs the reverse operation to .parse(), effectively returning HTML
    given a normal text and its MessageEntity's.

    :param text: the text to be reconverted into HTML.
    :param entities: the MessageEntity's applied to the text.
    :return: a HTML representation of the combination of both inputs.
    """
    if not text:
        return text
    elif not entities:
        return escape(text)

    text = _add_surrogate(text)
    if _length is None:
        _length = len(text)
    html = []
    last_offset = 0
    for i, entity in enumerate(entities):
        if entity.offset >= _offset + _length:
            break
        relative_offset = entity.offset - _offset
        if relative_offset > last_offset:
            html.append(escape(text[last_offset:relative_offset]))
        elif relative_offset < last_offset:
            continue

        skip_entity = False
        length = entity.length

        # If we are in the middle of a surrogate nudge the position by +1.
        # Otherwise we would end up with malformed text and fail to encode.
        # For example of bad input: "Hi \ud83d\ude1c"
        # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
        while helpers.within_surrogate(text, relative_offset, length=_length):
            relative_offset += 1

        while helpers.within_surrogate(text, relative_offset + length, length=_length):
            length += 1

        entity_text = unparse(text=text[relative_offset:relative_offset + length],
                              entities=entities[i + 1:],
                              _offset=entity.offset, _length=length)
        entity_type = type(entity)

        if entity_type == MessageEntityBold:
            html.append('<strong>{}</strong>'.format(entity_text))
        elif entity_type == MessageEntityItalic:
            html.append('<em>{}</em>'.format(entity_text))
        elif entity_type == MessageEntityCode:
            html.append('<code>{}</code>'.format(entity_text))
        elif entity_type == MessageEntityUnderline:
            html.append('<u>{}</u>'.format(entity_text))
        elif entity_type == MessageEntityStrike:
            html.append('<del>{}</del>'.format(entity_text))
        elif entity_type == MessageEntityBlockquote:
            html.append('<blockquote>{}</blockquote>'.format(entity_text))
        elif entity_type == MessageEntityPre:
            if entity.language:
                html.append(
                    "<pre>\n"
                    "    <code class='language-{}'>\n"
                    "        {}\n"
                    "    </code>\n"
                    "</pre>".format(entity.language, entity_text))
            else:
                html.append('<pre><code>{}</code></pre>'
                            .format(entity_text))
        elif entity_type == MessageEntityEmail:
            html.append('<a href="mailto:{0}">{0}</a>'.format(entity_text))
        elif entity_type == MessageEntityUrl:
            html.append('<a href="{0}">{0}</a>'.format(entity_text))
        elif entity_type == MessageEntityTextUrl:
            html.append('<a href="{}">{}</a>'
                        .format(escape(entity.url), entity_text))
        elif entity_type == MessageEntityMentionName:
            html.append('<a href="tg://user?id={}">{}</a>'
                        .format(entity.user_id, entity_text))
        else:
            skip_entity = True
        last_offset = relative_offset + (0 if skip_entity else length)

    while helpers.within_surrogate(text, last_offset, length=_length):
        last_offset += 1

    html.append(escape(text[last_offset:]))
    return _del_surrogate(''.join(html))
Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00			`"""`
			`Simple HTML -> Telegram entity parser.`
			`"""`
Fix HTML entity parsing failing when needing surrogates 2018-02-15 13:52:46 +03:00			`import struct`
			`from collections import deque`
Remove unwanted html.unescape() call 2019-08-04 11:09:23 +03:00			`from html import escape`
Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00			`from html.parser import HTMLParser`
Add support for unparsing nested entities into HTML (#1209) 2019-06-24 13:28:14 +03:00			`from typing import Iterable, Optional, Tuple, List`
Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00
Locally strip outgoing message text respecting entities 2018-11-19 12:15:56 +03:00			`from .. import helpers`
Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00			`from ..tl.types import (`
			`MessageEntityBold, MessageEntityItalic, MessageEntityCode,`
			`MessageEntityPre, MessageEntityEmail, MessageEntityUrl,`
Add new message entities to markdown/html parsers 2019-06-23 22:35:33 +03:00			`MessageEntityTextUrl, MessageEntityMentionName,`
Add support for unparsing nested entities into HTML (#1209) 2019-06-24 13:28:14 +03:00			`MessageEntityUnderline, MessageEntityStrike, MessageEntityBlockquote,`
			`TypeMessageEntity`
Locally strip outgoing message text respecting entities 2018-11-19 12:15:56 +03:00			`)`
Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00

Fix HTML entity parsing failing when needing surrogates 2018-02-15 13:52:46 +03:00			`# Helpers from markdown.py`
			`def _add_surrogate(text):`
			`return ''.join(`
			`''.join(chr(y) for y in struct.unpack('<HH', x.encode('utf-16le')))`
			`if (0x10000 <= ord(x) <= 0x10FFFF) else x for x in text`
			`)`


			`def _del_surrogate(text):`
			`return text.encode('utf-16', 'surrogatepass').decode('utf-16')`


Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00			`class HTMLToTelegramParser(HTMLParser):`
			`def __init__(self):`
			`super().__init__()`
			`self.text = ''`
			`self.entities = []`
			`self._building_entities = {}`
			`self._open_tags = deque()`
			`self._open_tags_meta = deque()`

			`def handle_starttag(self, tag, attrs):`
			`self._open_tags.appendleft(tag)`
			`self._open_tags_meta.appendleft(None)`

			`attrs = dict(attrs)`
			`EntityType = None`
			`args = {}`
			`if tag == 'strong' or tag == 'b':`
			`EntityType = MessageEntityBold`
			`elif tag == 'em' or tag == 'i':`
			`EntityType = MessageEntityItalic`
Add new message entities to markdown/html parsers 2019-06-23 22:35:33 +03:00			`elif tag == 'u':`
			`EntityType = MessageEntityUnderline`
			`elif tag == 'del' or tag == 's':`
			`EntityType = MessageEntityStrike`
			`elif tag == 'blockquote':`
			`EntityType = MessageEntityBlockquote`
Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00			`elif tag == 'code':`
			`try:`
			`# If we're in the middle of a <pre> tag, this <code> tag is`
			`# probably intended for syntax highlighting.`
			`#`
			`# Syntax highlighting is set with`
			`# <code class='language-...'>codeblock</code>`
			`# inside <pre> tags`
			`pre = self._building_entities['pre']`
			`try:`
			`pre.language = attrs['class'][len('language-'):]`
			`except KeyError:`
			`pass`
			`except KeyError:`
			`EntityType = MessageEntityCode`
			`elif tag == 'pre':`
			`EntityType = MessageEntityPre`
			`args['language'] = ''`
			`elif tag == 'a':`
			`try:`
			`url = attrs['href']`
			`except KeyError:`
			`return`
			`if url.startswith('mailto:'):`
			`url = url[len('mailto:'):]`
			`EntityType = MessageEntityEmail`
			`else:`
			`if self.get_starttag_text() == url:`
			`EntityType = MessageEntityUrl`
			`else:`
			`EntityType = MessageEntityTextUrl`
			`args['url'] = url`
			`url = None`
			`self._open_tags_meta.popleft()`
			`self._open_tags_meta.appendleft(url)`

			`if EntityType and tag not in self._building_entities:`
			`self._building_entities[tag] = EntityType(`
			`offset=len(self.text),`
			`# The length will be determined when closing the tag.`
			`length=0,`
			`**args)`

			`def handle_data(self, text):`
			`previous_tag = self._open_tags[0] if len(self._open_tags) > 0 else ''`
			`if previous_tag == 'a':`
			`url = self._open_tags_meta[0]`
			`if url:`
			`text = url`

			`for tag, entity in self._building_entities.items():`
Make HTML parser yield the correct offset and lengths 2018-11-19 11:20:13 +03:00			`entity.length += len(text)`
Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00
			`self.text += text`

			`def handle_endtag(self, tag):`
			`try:`
			`self._open_tags.popleft()`
			`self._open_tags_meta.popleft()`
			`except IndexError:`
			`pass`
			`entity = self._building_entities.pop(tag, None)`
			`if entity:`
			`self.entities.append(entity)`


Add support for unparsing nested entities into HTML (#1209) 2019-06-24 13:28:14 +03:00			`def parse(html: str) -> Tuple[str, List[TypeMessageEntity]]:`
Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00			`"""`
			`Parses the given HTML message and returns its stripped representation`
			`plus a list of the MessageEntity's that were found.`

Fix several typos (#1328) 2019-11-10 13:29:43 +03:00			`:param html: the message with HTML to be parsed.`
Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00			`:return: a tuple consisting of (clean message, [message entities]).`
			`"""`
Fix parsers misbehaving with None text 2018-06-03 14:48:43 +03:00			`if not html:`
			`return html, []`

Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00			`parser = HTMLToTelegramParser()`
Fix HTML entity parsing failing when needing surrogates 2018-02-15 13:52:46 +03:00			`parser.feed(_add_surrogate(html))`
Locally strip outgoing message text respecting entities 2018-11-19 12:15:56 +03:00			`text = helpers.strip_text(parser.text, parser.entities)`
			`return _del_surrogate(text), parser.entities`
Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00

Add support for unparsing nested entities into HTML (#1209) 2019-06-24 13:28:14 +03:00			`def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0,`
			`_length: Optional[int] = None) -> str:`
Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00			`"""`
			`Performs the reverse operation to .parse(), effectively returning HTML`
			`given a normal text and its MessageEntity's.`

			`:param text: the text to be reconverted into HTML.`
			`:param entities: the MessageEntity's applied to the text.`
			`:return: a HTML representation of the combination of both inputs.`
			`"""`
Add support for unparsing nested entities into HTML (#1209) 2019-06-24 13:28:14 +03:00			`if not text:`
Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00			`return text`
Add support for unparsing nested entities into HTML (#1209) 2019-06-24 13:28:14 +03:00			`elif not entities:`
			`return escape(text)`
Fix HTML entity parsing failing when needing surrogates 2018-02-15 13:52:46 +03:00
			`text = _add_surrogate(text)`
Add support for unparsing nested entities into HTML (#1209) 2019-06-24 13:28:14 +03:00			`if _length is None:`
			`_length = len(text)`
Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00			`html = []`
			`last_offset = 0`
Add support for unparsing nested entities into HTML (#1209) 2019-06-24 13:28:14 +03:00			`for i, entity in enumerate(entities):`
Fix unparsing of entities that are together 2020-02-20 11:43:37 +03:00			`if entity.offset >= _offset + _length:`
Add support for unparsing nested entities into HTML (#1209) 2019-06-24 13:28:14 +03:00			`break`
			`relative_offset = entity.offset - _offset`
			`if relative_offset > last_offset:`
			`html.append(escape(text[last_offset:relative_offset]))`
			`elif relative_offset < last_offset:`
Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00			`continue`

			`skip_entity = False`
Fix unparsing text with malformed message entities 2019-12-19 17:48:27 +03:00			`length = entity.length`

			`# If we are in the middle of a surrogate nudge the position by +1.`
			`# Otherwise we would end up with malformed text and fail to encode.`
			`# For example of bad input: "Hi \ud83d\ude1c"`
			`# https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF`
Fix within surrogate detection 2020-02-20 12:53:28 +03:00			`while helpers.within_surrogate(text, relative_offset, length=_length):`
Fix unparsing text with malformed message entities 2019-12-19 17:48:27 +03:00			`relative_offset += 1`

Fix within surrogate detection 2020-02-20 12:53:28 +03:00			`while helpers.within_surrogate(text, relative_offset + length, length=_length):`
Fix unparsing text with malformed message entities 2019-12-19 17:48:27 +03:00			`length += 1`

			`entity_text = unparse(text=text[relative_offset:relative_offset + length],`
Add support for unparsing nested entities into HTML (#1209) 2019-06-24 13:28:14 +03:00			`entities=entities[i + 1:],`
Fix unparsing text with malformed message entities 2019-12-19 17:48:27 +03:00			`_offset=entity.offset, _length=length)`
Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00			`entity_type = type(entity)`

			`if entity_type == MessageEntityBold:`
			`html.append('<strong>{}</strong>'.format(entity_text))`
			`elif entity_type == MessageEntityItalic:`
			`html.append('<em>{}</em>'.format(entity_text))`
			`elif entity_type == MessageEntityCode:`
			`html.append('<code>{}</code>'.format(entity_text))`
Add new message entities to markdown/html parsers 2019-06-23 22:35:33 +03:00			`elif entity_type == MessageEntityUnderline:`
			`html.append('<u>{}</u>'.format(entity_text))`
			`elif entity_type == MessageEntityStrike:`
			`html.append('<del>{}</del>'.format(entity_text))`
			`elif entity_type == MessageEntityBlockquote:`
			`html.append('<blockquote>{}</blockquote>'.format(entity_text))`
Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00			`elif entity_type == MessageEntityPre:`
			`if entity.language:`
			`html.append(`
			`"<pre>\n"`
			`" <code class='language-{}'>\n"`
			`" {}\n"`
			`" </code>\n"`
			`"</pre>".format(entity.language, entity_text))`
			`else:`
			`html.append('<pre><code>{}</code></pre>'`
			`.format(entity_text))`
			`elif entity_type == MessageEntityEmail:`
			`html.append('<a href="mailto:{0}">{0}</a>'.format(entity_text))`
			`elif entity_type == MessageEntityUrl:`
			`html.append('<a href="{0}">{0}</a>'.format(entity_text))`
			`elif entity_type == MessageEntityTextUrl:`
			`html.append('<a href="{}">{}</a>'`
			`.format(escape(entity.url), entity_text))`
Add name mention formatting to HTML and Markdown (#1019) 2018-10-04 16:56:32 +03:00			`elif entity_type == MessageEntityMentionName:`
			`html.append('<a href="tg://user?id={}">{}</a>'`
			`.format(entity.user_id, entity_text))`
Add HTML parse mode (#554) 2018-01-22 12:06:11 +03:00			`else:`
			`skip_entity = True`
Fix unparsing text with malformed message entities 2019-12-19 17:48:27 +03:00			`last_offset = relative_offset + (0 if skip_entity else length)`

Fix within surrogate detection 2020-02-20 12:53:28 +03:00			`while helpers.within_surrogate(text, last_offset, length=_length):`
Fix unparsing text with malformed message entities 2019-12-19 17:48:27 +03:00			`last_offset += 1`

Add support for unparsing nested entities into HTML (#1209) 2019-06-24 13:28:14 +03:00			`html.append(escape(text[last_offset:]))`
Fix HTML entity parsing failing when needing surrogates 2018-02-15 13:52:46 +03:00			`return _del_surrogate(''.join(html))`