Telethon/telethon/extensions/markdown.py

"""
Simple markdown parser with basic support for nested entities (nothing is
parsed inside code blocks). Intended primarily for use within the library,
which attempts to handle emoji correctly, since they seem to count as two
characters and it's a bit strange.
"""
import re
import warnings

from ..helpers import add_surrogate, del_surrogate, within_surrogate, strip_text
from ..tl import TLObject
from ..tl.types import (
    MessageEntityBold, MessageEntityItalic, MessageEntityCode,
    MessageEntityPre, MessageEntityTextUrl, MessageEntityMentionName,
    MessageEntityStrike
)

# Maps every markdown delimiter to the message entity type it stands for.
# The same mapping is inverted by ``unparse`` to go from entity type back
# to delimiter, so each entity type should appear only once.
DEFAULT_DELIMITERS = {
    '**': MessageEntityBold,
    '__': MessageEntityItalic,
    '~~': MessageEntityStrike,
    '`': MessageEntityCode,
    '```': MessageEntityPre
}

# Matches an inline link of the form ``[text](url)``. Group 1 is the visible
# text (``[\S\s]`` lets it span several lines) and group 2 is the URL (which
# cannot contain a newline). Both groups are matched lazily.
DEFAULT_URL_RE = re.compile(r'\[([\S\s]+?)\]\((.+?)\)')
# Template for an inline link; presumably {0} is the text and {1} the URL,
# mirroring the groups of DEFAULT_URL_RE. NOTE(review): not referenced by
# ``parse`` or ``unparse`` below, confirm whether external callers use it.
DEFAULT_URL_FORMAT = '[{0}]({1})'


def overlap(a, b, x, y):
    """
    Tells whether the span ``a..b`` intersects the span ``x..y``.

    :param a: start of the first span.
    :param b: end of the first span.
    :param x: start of the second span.
    :param y: end of the second span.
    :return: ``True`` when the larger of the two starts lies strictly
             below the smaller of the two ends, ``False`` otherwise.
    """
    latest_start = max(a, x)
    earliest_end = min(b, y)
    return latest_start < earliest_end


def parse(message, delimiters=None, url_re=None):
    """
    Parses the given markdown message and returns its stripped representation
    plus a list of the MessageEntity's that were found.

    :param message: the message with markdown-like syntax to be parsed.
                    Falsy values (``None``, ``''``) are returned as-is.
    :param delimiters: the delimiters to be used, {delimiter: type}.
                       ``None`` selects ``DEFAULT_DELIMITERS``; an empty
                       mapping disables parsing and the message is returned
                       untouched with no entities.
    :param url_re: the URL regex to be used, either already compiled or as
                   a ``str`` pattern. Must have two groups (text and URL).
                   ``None`` selects ``DEFAULT_URL_RE``.
    :return: a tuple consisting of (clean message, [message entities]).
    """
    if not message:
        return message, []

    if url_re is None:
        url_re = DEFAULT_URL_RE
    elif isinstance(url_re, str):
        url_re = re.compile(url_re)

    if not delimiters:
        if delimiters is not None:
            return message, []
        delimiters = DEFAULT_DELIMITERS

    # Build a regex to efficiently test all delimiters at once.
    # Note that the largest delimiter should go first, we don't
    # want ``` to be interpreted as a single back-tick in a code block.
    delim_re = re.compile('|'.join('({})'.format(re.escape(k))
                                   for k in sorted(delimiters, key=len, reverse=True)))

    # Cannot use a for loop because we need to skip some indices
    i = 0
    result = []

    # Entity offsets and lengths are computed as indices into the string
    # returned by add_surrogate.
    # NOTE(review): presumably that helper expands characters outside the
    # BMP into surrogate pairs so indices match what Telegram expects;
    # confirm against ..helpers.
    message = add_surrogate(message)
    while i < len(message):
        m = delim_re.match(message, pos=i)

        # Did we find some delimiter here at `i`?
        if m:
            # Exactly one alternative (hence one group) matched.
            delim = next(filter(None, m.groups()))

            # +1 to avoid matching right after (e.g. "****")
            end = message.find(delim, i + len(delim) + 1)

            # Did we find the earliest closing tag?
            if end != -1:

                # Remove the delimiter from the string
                message = ''.join((
                    message[:i],
                    message[i + len(delim):end],
                    message[end + len(delim):]
                ))

                # Check other affected entities
                for ent in result:
                    # If the end is after our start, it is affected
                    if ent.offset + ent.length > i:
                        # If the old start is also before ours, it is fully enclosed
                        if ent.offset <= i:
                            ent.length -= len(delim) * 2
                        else:
                            ent.length -= len(delim)

                # Append the found entity
                ent = delimiters[delim]
                if ent == MessageEntityPre:
                    result.append(ent(i, end - i - len(delim), ''))  # has 'lang'
                else:
                    result.append(ent(i, end - i - len(delim)))

                # No nested entities inside code blocks
                if ent in (MessageEntityCode, MessageEntityPre):
                    i = end - len(delim)

                # `i` is deliberately not advanced otherwise: the text that
                # now starts at `i` may itself begin with another delimiter
                # or an inline URL (directly nested entities).
                continue

        elif url_re:
            m = url_re.match(message, pos=i)
            if m:
                # Replace the whole match with only the inline URL text.
                message = ''.join((
                    message[:m.start()],
                    m.group(1),
                    message[m.end():]
                ))

                # Number of characters that disappeared when "[text](url)"
                # collapsed into "text". The previous formula subtracted the
                # full match length from itself and so was always zero,
                # leaving enclosing entities longer than the text they cover.
                delim_size = len(m.group()) - len(m.group(1))
                for ent in result:
                    # If the end is after our start, it is affected
                    if ent.offset + ent.length > m.start():
                        ent.length -= delim_size

                result.append(MessageEntityTextUrl(
                    offset=m.start(), length=len(m.group(1)),
                    url=del_surrogate(m.group(2))
                ))
                # Skip over the link text, it is not parsed any further.
                i += len(m.group(1))
                continue

        i += 1

    message = strip_text(message, result)
    return del_surrogate(message), result


def unparse(text, entities, delimiters=None, url_fmt=None):
    """
    Performs the reverse operation to .parse(), effectively returning
    markdown-like syntax given a normal text and its MessageEntity's.

    :param text: the text to be reconverted into markdown.
    :param entities: the MessageEntity's applied to the text. A single
                     entity (any ``TLObject``) is accepted as well.
    :param delimiters: the delimiters to be used, {delimiter: type}.
                       ``None`` selects ``DEFAULT_DELIMITERS``; an empty
                       mapping makes the text be returned untouched.
    :param url_fmt: deprecated and ignored; passing anything other than
                    ``None`` only emits a warning.
    :return: a markdown-like text representing the combination of both inputs.
    """
    if not text or not entities:
        return text

    if not delimiters:
        if delimiters is not None:
            return text
        delimiters = DEFAULT_DELIMITERS

    if url_fmt is not None:
        warnings.warn('url_fmt is deprecated')  # since it complicates everything *a lot*

    if isinstance(entities, TLObject):
        entities = (entities,)

    text = add_surrogate(text)
    delimiters = {v: k for k, v in delimiters.items()}

    # Every item is (sort key, text to insert). The sort key is built so
    # that ascending order equals the order the pieces must have in the
    # final text, which matters when several pieces share one position:
    #   * closing pieces (kind 0) go before opening pieces (kind 1);
    #   * among closers, the entity that started last (innermost) is first;
    #   * among openers, the entity that ends last (outermost) is first;
    #   * remaining ties follow the order of `entities` (mirrored for
    #     closers), so identical ranges still nest as "**__text__**".
    insert_at = []
    for order, entity in enumerate(entities, start=1):
        s = entity.offset
        e = entity.offset + entity.length
        delimiter = delimiters.get(type(entity), None)
        if delimiter:
            opening = closing = delimiter
        else:
            url = None
            if isinstance(entity, MessageEntityTextUrl):
                url = entity.url
            elif isinstance(entity, MessageEntityMentionName):
                url = 'tg://user?id={}'.format(entity.user_id)
            if not url:
                # Entity type with no markdown representation, skip it.
                continue
            opening = '['
            closing = ']({})'.format(url)

        if s == e:
            # Empty entity: keep both halves glued together, otherwise the
            # "closers first" rule would emit them in the wrong order.
            insert_at.append(((s, 1, -e, order), opening + closing))
        else:
            insert_at.append(((s, 1, -e, order), opening))
            insert_at.append(((e, 0, -s, -order), closing))

    insert_at.sort(key=lambda t: t[0])

    # Insert from the end of the text towards the start so that pending
    # positions are not shifted by what has already been inserted.
    while insert_at:
        key, what = insert_at.pop()
        at = key[0]

        # If we are in the middle of a surrogate nudge the position forward
        # (+1). Otherwise we would end up with malformed text and fail to
        # encode. For example of bad input: "Hi \ud83d\ude1c"
        # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
        while within_surrogate(text, at):
            at += 1

        text = text[:at] + what + text[at:]

    return del_surrogate(text)
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00			`"""`
			`Simple markdown parser which does not support nesting. Intended primarily`
			`for use within the library, which attempts to handle emojies correctly,`
			`since they seem to count as two characters and it's a bit strange.`
			`"""`
			`import re`
Update markdown parser to support nested entities 2019-06-24 14:48:29 +03:00			`import warnings`
Add method to md parser to extract text surrounded by entities 2017-11-16 21:13:13 +03:00
Fix within surrogate detection 2020-02-20 12:53:28 +03:00			`from ..helpers import add_surrogate, del_surrogate, within_surrogate, strip_text`
Fix import in markdown parser not being relative 2017-11-17 17:57:48 +03:00			`from ..tl import TLObject`
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00			`from ..tl.types import (`
Add ability to parse inline URLs 2017-10-29 18:33:10 +03:00			`MessageEntityBold, MessageEntityItalic, MessageEntityCode,`
Add new message entities to markdown/html parsers 2019-06-23 22:35:33 +03:00			`MessageEntityPre, MessageEntityTextUrl, MessageEntityMentionName,`
			`MessageEntityStrike`
Locally strip outgoing message text respecting entities 2018-11-19 12:15:56 +03:00			`)`
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00
Use constants and allow empty URL regex when parsing markdown 2017-10-29 20:21:21 +03:00			`DEFAULT_DELIMITERS = {`
Make markdown parser use only Telegram's MessageEntity's 2017-11-10 13:01:02 +03:00			`'**': MessageEntityBold,`
			`'__': MessageEntityItalic,`
Add new message entities to markdown/html parsers 2019-06-23 22:35:33 +03:00			`'~~': MessageEntityStrike,`
Make markdown parser use only Telegram's MessageEntity's 2017-11-10 13:01:02 +03:00			'`': MessageEntityCode,
			'```': MessageEntityPre
Use constants and allow empty URL regex when parsing markdown 2017-10-29 20:21:21 +03:00			`}`

Fix markdown regex not supporting [] inside URLs 2018-03-22 21:01:50 +03:00			`DEFAULT_URL_RE = re.compile(r'\[([\S\s]+?)\]\((.+?)\)')`
Add unparse markdown method 2017-11-26 19:16:59 +03:00			`DEFAULT_URL_FORMAT = '[{0}]({1})'`

Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00
Update markdown parser to support nested entities 2019-06-24 14:48:29 +03:00			`def overlap(a, b, x, y):`
			`return max(a, x) < min(b, y)`


Use constants and allow empty URL regex when parsing markdown 2017-10-29 20:21:21 +03:00			`def parse(message, delimiters=None, url_re=None):`
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00			`"""`
Document the extensions/ module 2017-11-26 19:14:28 +03:00			`Parses the given markdown message and returns its stripped representation`
			`plus a list of the MessageEntity's that were found.`
Add ability to parse inline URLs 2017-10-29 18:33:10 +03:00
Document the extensions/ module 2017-11-26 19:14:28 +03:00			`:param message: the message with markdown-like syntax to be parsed.`
			`:param delimiters: the delimiters to be used, {delimiter: type}.`
			`:param url_re: the URL bytes regex to be used. Must have two groups.`
			`:return: a tuple consisting of (clean message, [message entities]).`
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00			`"""`
Fix parsers misbehaving with None text 2018-06-03 14:48:43 +03:00			`if not message:`
			`return message, []`

Use constants and allow empty URL regex when parsing markdown 2017-10-29 20:21:21 +03:00			`if url_re is None:`
			`url_re = DEFAULT_URL_RE`
Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`elif isinstance(url_re, str):`
			`url_re = re.compile(url_re)`
Add ability to parse inline URLs 2017-10-29 18:33:10 +03:00
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00			`if not delimiters:`
			`if delimiters is not None:`
			`return message, []`
Use constants and allow empty URL regex when parsing markdown 2017-10-29 20:21:21 +03:00			`delimiters = DEFAULT_DELIMITERS`
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00
Fix markdown parsing for pre blocks and entity after entity 2019-07-05 21:29:32 +03:00			`# Build a regex to efficiently test all delimiters at once.`
			`# Note that the largest delimiter should go first, we don't`
			# want ``` to be interpreted as a single back-tick in a code block.
			`delim_re = re.compile('\|'.join('({})'.format(re.escape(k))`
			`for k in sorted(delimiters, key=len, reverse=True)))`
Update markdown parser to support nested entities 2019-06-24 14:48:29 +03:00
Add more comments to the markdown parser 2017-11-06 13:32:40 +03:00			`# Cannot use a for loop because we need to skip some indices`
Work on byte level when parsing markdown Reasoning: instead encoding every character one by one as we encounter them to use half their length as the correct offset, we can simply encode the whole string at once as utf-16le and work with that directly. 2017-11-06 12:29:32 +03:00			`i = 0`
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00			`result = []`
Add more comments to the markdown parser 2017-11-06 13:32:40 +03:00
			`# Work on byte level with the utf-16le encoding to get the offsets right.`
			`# The offset will just be half the index we're at.`
Avoid cyclic imports on older Python versions 2018-06-29 12:04:42 +03:00			`message = add_surrogate(message)`
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00			`while i < len(message):`
Update markdown parser to support nested entities 2019-06-24 14:48:29 +03:00			`m = delim_re.match(message, pos=i)`

			# Did we find some delimiter here at `i`?
			`if m:`
			`delim = next(filter(None, m.groups()))`

			`# +1 to avoid matching right after (e.g. "****")`
			`end = message.find(delim, i + len(delim) + 1)`

			`# Did we find the earliest closing tag?`
			`if end != -1:`

			`# Remove the delimiter from the string`
			`message = ''.join((`
			`message[:i],`
			`message[i + len(delim):end],`
			`message[end + len(delim):]`
			`))`

			`# Check other affected entities`
			`for ent in result:`
			`# If the end is after our start, it is affected`
			`if ent.offset + ent.length > i:`
Fix directly nested markdown entities 2019-07-06 13:55:44 +03:00			`# If the old start is also before ours, it is fully enclosed`
			`if ent.offset <= i:`
			`ent.length -= len(delim) * 2`
			`else:`
			`ent.length -= len(delim)`
Update markdown parser to support nested entities 2019-06-24 14:48:29 +03:00
			`# Append the found entity`
			`ent = delimiters[delim]`
			`if ent == MessageEntityPre:`
			`result.append(ent(i, end - i - len(delim), '')) # has 'lang'`
			`else:`
			`result.append(ent(i, end - i - len(delim)))`

			`# No nested entities inside code blocks`
			`if ent in (MessageEntityCode, MessageEntityPre):`
Fix markdown parsing for pre blocks and entity after entity 2019-07-05 21:29:32 +03:00			`i = end - len(delim)`
Update markdown parser to support nested entities 2019-06-24 14:48:29 +03:00
			`continue`

			`elif url_re:`
			`m = url_re.match(message, pos=i)`
			`if m:`
Add more comments to the markdown parser 2017-11-06 13:32:40 +03:00			`# Replace the whole match with only the inline URL text.`
Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`message = ''.join((`
Update markdown parser to support nested entities 2019-06-24 14:48:29 +03:00			`message[:m.start()],`
			`m.group(1),`
			`message[m.end():]`
Add ability to parse inline URLs 2017-10-29 18:33:10 +03:00			`))`
Proper offset calculation for markdown (#407) Dan suca If Dan shared it with Traitor I'll not have to spend my time on this Not a, sorry for not letting you sleep k thx bye Will this stay in history? 2017-11-06 02:17:22 +03:00
Update markdown parser to support nested entities 2019-06-24 14:48:29 +03:00			`delim_size = m.end() - m.start() - len(m.group())`
			`for ent in result:`
			`# If the end is after our start, it is affected`
			`if ent.offset + ent.length > m.start():`
			`ent.length -= delim_size`

Make markdown parser use only Telegram's MessageEntity's 2017-11-10 13:01:02 +03:00			`result.append(MessageEntityTextUrl(`
Update markdown parser to support nested entities 2019-06-24 14:48:29 +03:00			`offset=m.start(), length=len(m.group(1)),`
			`url=del_surrogate(m.group(2))`
Add ability to parse inline URLs 2017-10-29 18:33:10 +03:00			`))`
Update markdown parser to support nested entities 2019-06-24 14:48:29 +03:00			`i += len(m.group(1))`
Clean up markdown parsing since tuples aren't used anymore 2017-11-10 13:41:49 +03:00			`continue`
Work on byte level when parsing markdown Reasoning: instead encoding every character one by one as we encounter them to use half their length as the correct offset, we can simply encode the whole string at once as utf-16le and work with that directly. 2017-11-06 12:29:32 +03:00
Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`i += 1`
Fix markdown parsing failing if delimiter was last character 2017-10-28 20:17:18 +03:00
Locally strip outgoing message text respecting entities 2018-11-19 12:15:56 +03:00			`message = strip_text(message, result)`
Avoid cyclic imports on older Python versions 2018-06-29 12:04:42 +03:00			`return del_surrogate(message), result`
Add unparse markdown method 2017-11-26 19:16:59 +03:00

			`def unparse(text, entities, delimiters=None, url_fmt=None):`
			`"""`
			`Performs the reverse operation to .parse(), effectively returning`
			`markdown-like syntax given a normal text and its MessageEntity's.`

			`:param text: the text to be reconverted into markdown.`
			`:param entities: the MessageEntity's applied to the text.`
			`:return: a markdown-like text representing the combination of both inputs.`
			`"""`
Fix parsers misbehaving with None text 2018-06-03 14:48:43 +03:00			`if not text or not entities:`
Fix HTML entity parsing failing when needing surrogates 2018-02-15 13:52:46 +03:00			`return text`

Add unparse markdown method 2017-11-26 19:16:59 +03:00			`if not delimiters:`
			`if delimiters is not None:`
			`return text`
			`delimiters = DEFAULT_DELIMITERS`

Update markdown parser to support nested entities 2019-06-24 14:48:29 +03:00			`if url_fmt is not None:`
			`warnings.warn('url_fmt is deprecated') # since it complicates everything a lot`
Add unparse markdown method 2017-11-26 19:16:59 +03:00
			`if isinstance(entities, TLObject):`
			`entities = (entities,)`

Avoid cyclic imports on older Python versions 2018-06-29 12:04:42 +03:00			`text = add_surrogate(text)`
Fix markdown parser not inverting delimiters dict 2018-02-16 22:30:19 +03:00			`delimiters = {v: k for k, v in delimiters.items()}`
Update markdown parser to support nested entities 2019-06-24 14:48:29 +03:00			`insert_at = []`
Add unparse markdown method 2017-11-26 19:16:59 +03:00			`for entity in entities:`
Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`s = entity.offset`
			`e = entity.offset + entity.length`
Add unparse markdown method 2017-11-26 19:16:59 +03:00			`delimiter = delimiters.get(type(entity), None)`
			`if delimiter:`
Update markdown parser to support nested entities 2019-06-24 14:48:29 +03:00			`insert_at.append((s, delimiter))`
			`insert_at.append((e, delimiter))`
			`else:`
Add name mention formatting to HTML and Markdown (#1019) 2018-10-04 16:56:32 +03:00			`url = None`
			`if isinstance(entity, MessageEntityTextUrl):`
			`url = entity.url`
			`elif isinstance(entity, MessageEntityMentionName):`
			`url = 'tg://user?id={}'.format(entity.user_id)`
			`if url:`
Update markdown parser to support nested entities 2019-06-24 14:48:29 +03:00			`insert_at.append((s, '['))`
			`insert_at.append((e, ']({})'.format(url)))`

			`insert_at.sort(key=lambda t: t[0])`
			`while insert_at:`
			`at, what = insert_at.pop()`
Fix unparsing text with malformed message entities 2019-12-19 17:48:27 +03:00
Fix within surrogate detection 2020-02-20 12:53:28 +03:00			`# If we are in the middle of a surrogate nudge the position by -1.`
Fix unparsing text with malformed message entities 2019-12-19 17:48:27 +03:00			`# Otherwise we would end up with malformed text and fail to encode.`
			`# For example of bad input: "Hi \ud83d\ude1c"`
			`# https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF`
Fix within surrogate detection 2020-02-20 12:53:28 +03:00			`while within_surrogate(text, at):`
Fix unparsing text with malformed message entities 2019-12-19 17:48:27 +03:00			`at += 1`

Update markdown parser to support nested entities 2019-06-24 14:48:29 +03:00			`text = text[:at] + what + text[at:]`
Add unparse markdown method 2017-11-26 19:16:59 +03:00
Avoid cyclic imports on older Python versions 2018-06-29 12:04:42 +03:00			`return del_surrogate(text)`