Telethon/telethon/extensions/markdown.py

"""
Simple markdown parser which does not support nesting. Intended primarily
for use within the library, it attempts to handle emojis correctly, since
they seem to count as two characters (which is a bit strange).
"""
import re

from ..tl import TLObject
from ..tl.types import (
    MessageEntityBold, MessageEntityItalic, MessageEntityCode,
    MessageEntityPre, MessageEntityTextUrl
)
from ..utils import (
    add_surrogate as _add_surrogate,
    del_surrogate as _del_surrogate
)

# Maps each markdown delimiter to the MessageEntity class it produces.
# parse() checks these in dict order at every position; it also refuses an
# opening delimiter that is immediately followed by itself, which is what
# keeps '`' from swallowing the first character of '```'.
DEFAULT_DELIMITERS = {
    '**': MessageEntityBold,
    '__': MessageEntityItalic,
    '`': MessageEntityCode,
    '```': MessageEntityPre
}

# Inline URL syntax "[text](url)". Group 1 is the visible text (lazy, and
# [\S\s] lets it span newlines); group 2 is the URL (lazy, '.' only).
DEFAULT_URL_RE = re.compile(r'\[([\S\s]+?)\]\((.+?)\)')
# Template used by unparse(): {0} is the visible text and {1} is the URL.
DEFAULT_URL_FORMAT = '[{0}]({1})'


def parse(message, delimiters=None, url_re=None):
    """
    Parses the given markdown message and returns its stripped representation
    plus a list of the MessageEntity's that were found.

    Nesting is not supported: while one entity is open, neither inline URLs
    nor other delimiters are recognised until its closing delimiter is found.
    An opening delimiter without a matching closing one is put back into the
    returned text and produces no entity.

    :param message: the message with markdown-like syntax to be parsed.
                    Falsy values (``None``, ``''``) are returned untouched.
    :param delimiters: the delimiters to be used, {delimiter: type}. ``None``
                       selects ``DEFAULT_DELIMITERS``; an empty mapping
                       disables parsing and returns the message untouched.
    :param url_re: the URL regex to be used, either a ``str`` or a compiled
                   pattern. Must have two groups (visible text, URL).
                   ``None`` selects ``DEFAULT_URL_RE``; any other falsy
                   value (such as ``''``) disables inline URL parsing.
    :return: a tuple consisting of (clean message, [message entities]).
    """
    if not message:
        return message, []

    if url_re is None:
        url_re = DEFAULT_URL_RE
    elif isinstance(url_re, str):
        # An empty pattern compiles into a (truthy) regex that matches the
        # empty string everywhere and has no groups, so using it would fail
        # on group(1). Treat it as "no URL parsing", like other falsy values.
        url_re = re.compile(url_re) if url_re else None

    if not delimiters:
        if delimiters is not None:
            return message, []
        delimiters = DEFAULT_DELIMITERS

    # Cannot use a for loop because we need to skip some indices
    i = 0
    result = []
    current = None
    end_delimiter = None

    # Work on the surrogate-expanded string so that plain string indices can
    # be used directly as entity offsets and lengths.
    # NOTE(review): presumably _add_surrogate splits characters outside the
    # BMP into surrogate pairs to match Telegram's offsets; confirm in utils.
    message = _add_surrogate(message)
    while i < len(message):
        if url_re and current is None:
            # If we're not inside a previous match since Telegram doesn't allow
            # nested message entities, try matching the URL from the i'th pos.
            url_match = url_re.match(message, pos=i)
            # A zero-width match would leave both the message and i unchanged
            # and loop forever, so it is treated as "no URL here".
            if url_match and url_match.end() > url_match.start():
                # Replace the whole match with only the inline URL text.
                message = ''.join((
                    message[:url_match.start()],
                    url_match.group(1),
                    message[url_match.end():]
                ))

                result.append(MessageEntityTextUrl(
                    offset=url_match.start(), length=len(url_match.group(1)),
                    url=_del_surrogate(url_match.group(2))
                ))
                i += len(url_match.group(1))
                # Next loop iteration, don't check delimiters, since
                # a new inline URL might be right after this one.
                continue

        if end_delimiter is None:
            # We're not expecting any delimiter, so check them all
            for d, m in delimiters.items():
                # Slice the string at the current i'th position to see if
                # it matches the current delimiter d, otherwise skip it.
                if message[i:i + len(d)] != d:
                    continue

                if message[i + len(d):i + 2 * len(d)] == d:
                    # The same delimiter can't be right afterwards, if
                    # this were the case we would match empty strings
                    # like `` which we don't want to.
                    continue

                # Get rid of the delimiter by slicing it away
                message = message[:i] + message[i + len(d):]
                if m == MessageEntityPre:
                    # Special case, also has 'lang'
                    current = m(i, None, '')
                else:
                    current = m(i, None)

                end_delimiter = d  # We expect the same delimiter.
                break

        elif message[i:i + len(end_delimiter)] == end_delimiter:
            # Closing delimiter found: strip it and finish the open entity.
            message = message[:i] + message[i + len(end_delimiter):]
            current.length = i - current.offset
            result.append(current)
            current, end_delimiter = None, None
            # Don't increment i here as we matched a delimiter,
            # and there may be a new one right after. This is
            # different than when encountering the first delimiter,
            # as we already know there won't be the same right after.
            continue

        # Next iteration
        i += 1

    # We may have found a delimiter but not its ending pair.
    # If this is the case, we want to insert the delimiter character back.
    if current is not None:
        message = (
            message[:current.offset]
            + end_delimiter
            + message[current.offset:]
        )

    return _del_surrogate(message), result


def unparse(text, entities, delimiters=None, url_fmt=None):
    """
    Performs the reverse operation to .parse(), effectively returning
    markdown-like syntax given a normal text and its MessageEntity's.

    Entities are applied from the highest offset to the lowest, so that
    inserting markup never shifts the offsets of entities still pending.

    :param text: the text to be reconverted into markdown.
    :param entities: the MessageEntity's applied to the text; either a
                     single entity or an iterable of them.
    :param delimiters: the delimiters to be used, {delimiter: type}. ``None``
                       selects ``DEFAULT_DELIMITERS``; an empty mapping
                       returns the text untouched.
    :param url_fmt: format string for inline URLs, where ``{0}`` is the text
                    and ``{1}`` the URL. ``None`` selects
                    ``DEFAULT_URL_FORMAT``; an empty value leaves URL
                    entities as plain text.
    :return: a markdown-like text representing the combination of both inputs.
    """
    if not (text and entities):
        return text

    if delimiters is None:
        delimiters = DEFAULT_DELIMITERS
    elif not delimiters:
        return text

    link_template = DEFAULT_URL_FORMAT if url_fmt is None else url_fmt

    if isinstance(entities, TLObject):
        pending = (entities,)
    else:
        pending = sorted(entities, key=lambda ent: ent.offset, reverse=True)

    text = _add_surrogate(text)
    # Invert the mapping: we need entity type -> delimiter here.
    marker_for = {kind: token for token, kind in delimiters.items()}
    for entity in pending:
        start = entity.offset
        stop = start + entity.length
        marker = marker_for.get(type(entity))
        if marker:
            text = ''.join((
                text[:start], marker, text[start:stop], marker, text[stop:]
            ))
        elif link_template and isinstance(entity, MessageEntityTextUrl):
            link = _add_surrogate(
                link_template.format(text[start:stop], entity.url)
            )
            text = text[:start] + link + text[stop:]

    return _del_surrogate(text)
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00			`"""`
			`Simple markdown parser which does not support nesting. Intended primarily`
			`for use within the library, which attempts to handle emojies correctly,`
			`since they seem to count as two characters and it's a bit strange.`
			`"""`
			`import re`
Add method to md parser to extract text surrounded by entities 2017-11-16 21:13:13 +03:00
Fix import in markdown parser not being relative 2017-11-17 17:57:48 +03:00			`from ..tl import TLObject`
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00			`from ..tl.types import (`
Add ability to parse inline URLs 2017-10-29 18:33:10 +03:00			`MessageEntityBold, MessageEntityItalic, MessageEntityCode,`
			`MessageEntityPre, MessageEntityTextUrl`
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00			`)`
Add some setters for custom.Message 2018-06-03 12:53:18 +03:00			`from ..utils import (`
			`add_surrogate as _add_surrogate,`
			`del_surrogate as _del_surrogate`
			`)`
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00
Use constants and allow empty URL regex when parsing markdown 2017-10-29 20:21:21 +03:00			`DEFAULT_DELIMITERS = {`
Make markdown parser use only Telegram's MessageEntity's 2017-11-10 13:01:02 +03:00			`'**': MessageEntityBold,`
			`'__': MessageEntityItalic,`
			'`': MessageEntityCode,
			'```': MessageEntityPre
Use constants and allow empty URL regex when parsing markdown 2017-10-29 20:21:21 +03:00			`}`

Fix markdown regex not supporting [] inside URLs 2018-03-22 21:01:50 +03:00			`DEFAULT_URL_RE = re.compile(r'\[([\S\s]+?)\]\((.+?)\)')`
Add unparse markdown method 2017-11-26 19:16:59 +03:00			`DEFAULT_URL_FORMAT = '[{0}]({1})'`

Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00
Use constants and allow empty URL regex when parsing markdown 2017-10-29 20:21:21 +03:00			`def parse(message, delimiters=None, url_re=None):`
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00			`"""`
Document the extensions/ module 2017-11-26 19:14:28 +03:00			`Parses the given markdown message and returns its stripped representation`
			`plus a list of the MessageEntity's that were found.`
Add ability to parse inline URLs 2017-10-29 18:33:10 +03:00
Document the extensions/ module 2017-11-26 19:14:28 +03:00			`:param message: the message with markdown-like syntax to be parsed.`
			`:param delimiters: the delimiters to be used, {delimiter: type}.`
			`:param url_re: the URL bytes regex to be used. Must have two groups.`
			`:return: a tuple consisting of (clean message, [message entities]).`
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00			`"""`
Fix parsers misbehaving with None text 2018-06-03 14:48:43 +03:00			`if not message:`
			`return message, []`

Use constants and allow empty URL regex when parsing markdown 2017-10-29 20:21:21 +03:00			`if url_re is None:`
			`url_re = DEFAULT_URL_RE`
Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`elif isinstance(url_re, str):`
			`url_re = re.compile(url_re)`
Add ability to parse inline URLs 2017-10-29 18:33:10 +03:00
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00			`if not delimiters:`
			`if delimiters is not None:`
			`return message, []`
Use constants and allow empty URL regex when parsing markdown 2017-10-29 20:21:21 +03:00			`delimiters = DEFAULT_DELIMITERS`
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00
Add more comments to the markdown parser 2017-11-06 13:32:40 +03:00			`# Cannot use a for loop because we need to skip some indices`
Work on byte level when parsing markdown Reasoning: instead encoding every character one by one as we encounter them to use half their length as the correct offset, we can simply encode the whole string at once as utf-16le and work with that directly. 2017-11-06 12:29:32 +03:00			`i = 0`
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00			`result = []`
Make markdown parser use only Telegram's MessageEntity's 2017-11-10 13:01:02 +03:00			`current = None`
Clean up markdown parsing since tuples aren't used anymore 2017-11-10 13:41:49 +03:00			`end_delimiter = None`
Add more comments to the markdown parser 2017-11-06 13:32:40 +03:00
			`# Work on byte level with the utf-16le encoding to get the offsets right.`
			`# The offset will just be half the index we're at.`
Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`message = _add_surrogate(message)`
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00			`while i < len(message):`
Make markdown parser use only Telegram's MessageEntity's 2017-11-10 13:01:02 +03:00			`if url_re and current is None:`
Add more comments to the markdown parser 2017-11-06 13:32:40 +03:00			`# If we're not inside a previous match since Telegram doesn't allow`
			`# nested message entities, try matching the URL from the i'th pos.`
Add ability to parse inline URLs 2017-10-29 18:33:10 +03:00			`url_match = url_re.match(message, pos=i)`
			`if url_match:`
Add more comments to the markdown parser 2017-11-06 13:32:40 +03:00			`# Replace the whole match with only the inline URL text.`
Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`message = ''.join((`
Add ability to parse inline URLs 2017-10-29 18:33:10 +03:00			`message[:url_match.start()],`
			`url_match.group(1),`
			`message[url_match.end():]`
			`))`
Proper offset calculation for markdown (#407) Dan suca If Dan shared it with Traitor I'll not have to spend my time on this Not a, sorry for not letting you sleep k thx bye Will this stay in history? 2017-11-06 02:17:22 +03:00
Make markdown parser use only Telegram's MessageEntity's 2017-11-10 13:01:02 +03:00			`result.append(MessageEntityTextUrl(`
Replace offset with match.start() to allow custom regex 2018-04-03 14:46:54 +03:00			`offset=url_match.start(), length=len(url_match.group(1)),`
Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`url=_del_surrogate(url_match.group(2))`
Add ability to parse inline URLs 2017-10-29 18:33:10 +03:00			`))`
Clean up markdown parsing since tuples aren't used anymore 2017-11-10 13:41:49 +03:00			`i += len(url_match.group(1))`
			`# Next loop iteration, don't check delimiters, since`
			`# a new inline URL might be right after this one.`
			`continue`
Work on byte level when parsing markdown Reasoning: instead encoding every character one by one as we encounter them to use half their length as the correct offset, we can simply encode the whole string at once as utf-16le and work with that directly. 2017-11-06 12:29:32 +03:00
Clean up markdown parsing since tuples aren't used anymore 2017-11-10 13:41:49 +03:00			`if end_delimiter is None:`
			`# We're not expecting any delimiter, so check them all`
Add ability to parse inline URLs 2017-10-29 18:33:10 +03:00			`for d, m in delimiters.items():`
Add more comments to the markdown parser 2017-11-06 13:32:40 +03:00			`# Slice the string at the current i'th position to see if`
Clean up markdown parsing since tuples aren't used anymore 2017-11-10 13:41:49 +03:00			`# it matches the current delimiter d, otherwise skip it.`
			`if message[i:i + len(d)] != d:`
			`continue`

			`if message[i + len(d):i + 2 * len(d)] == d:`
			`# The same delimiter can't be right afterwards, if`
			`# this were the case we would match empty strings`
			# like `` which we don't want to.
			`continue`

			`# Get rid of the delimiter by slicing it away`
			`message = message[:i] + message[i + len(d):]`
			`if m == MessageEntityPre:`
			`# Special case, also has 'lang'`
Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`current = m(i, None, '')`
Clean up markdown parsing since tuples aren't used anymore 2017-11-10 13:41:49 +03:00			`else:`
Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`current = m(i, None)`
Clean up markdown parsing since tuples aren't used anymore 2017-11-10 13:41:49 +03:00
			`end_delimiter = d # We expect the same delimiter.`
			`break`

			`elif message[i:i + len(end_delimiter)] == end_delimiter:`
			`message = message[:i] + message[i + len(end_delimiter):]`
Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`current.length = i - current.offset`
Clean up markdown parsing since tuples aren't used anymore 2017-11-10 13:41:49 +03:00			`result.append(current)`
			`current, end_delimiter = None, None`
			`# Don't increment i here as we matched a delimiter,`
			`# and there may be a new one right after. This is`
			`# different than when encountering the first delimiter,`
			`# as we already know there won't be the same right after.`
			`continue`
Initial attempt at parsing Markdown-like syntax 2017-10-28 20:06:41 +03:00
Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`# Next iteration`
			`i += 1`
Fix markdown parsing failing if delimiter was last character 2017-10-28 20:17:18 +03:00
Make markdown parser use only Telegram's MessageEntity's 2017-11-10 13:01:02 +03:00			`# We may have found some a delimiter but not its ending pair.`
Fix unfinished markdown delimiters being stripped away 2017-11-10 13:44:27 +03:00			`# If this is the case, we want to insert the delimiter character back.`
			`if current is not None:`
Fix c4e07cf, md parsing adding unfinished entity at wrong offset 2017-11-16 21:07:53 +03:00			`message = (`
Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`message[:current.offset]`
Fix c4e07cf, md parsing adding unfinished entity at wrong offset 2017-11-16 21:07:53 +03:00			`+ end_delimiter`
Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`+ message[current.offset:]`
Fix c4e07cf, md parsing adding unfinished entity at wrong offset 2017-11-16 21:07:53 +03:00			`)`
Proper offset calculation for markdown (#407) Dan suca If Dan shared it with Traitor I'll not have to spend my time on this Not a, sorry for not letting you sleep k thx bye Will this stay in history? 2017-11-06 02:17:22 +03:00
Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`return _del_surrogate(message), result`
Add unparse markdown method 2017-11-26 19:16:59 +03:00

			`def unparse(text, entities, delimiters=None, url_fmt=None):`
			`"""`
			`Performs the reverse operation to .parse(), effectively returning`
			`markdown-like syntax given a normal text and its MessageEntity's.`

			`:param text: the text to be reconverted into markdown.`
			`:param entities: the MessageEntity's applied to the text.`
			`:return: a markdown-like text representing the combination of both inputs.`
			`"""`
Fix parsers misbehaving with None text 2018-06-03 14:48:43 +03:00			`if not text or not entities:`
Fix HTML entity parsing failing when needing surrogates 2018-02-15 13:52:46 +03:00			`return text`

Add unparse markdown method 2017-11-26 19:16:59 +03:00			`if not delimiters:`
			`if delimiters is not None:`
			`return text`
			`delimiters = DEFAULT_DELIMITERS`

			`if url_fmt is None:`
			`url_fmt = DEFAULT_URL_FORMAT`

			`if isinstance(entities, TLObject):`
			`entities = (entities,)`
			`else:`
			`entities = tuple(sorted(entities, key=lambda e: e.offset, reverse=True))`

Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`text = _add_surrogate(text)`
Fix markdown parser not inverting delimiters dict 2018-02-16 22:30:19 +03:00			`delimiters = {v: k for k, v in delimiters.items()}`
Add unparse markdown method 2017-11-26 19:16:59 +03:00			`for entity in entities:`
Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`s = entity.offset`
			`e = entity.offset + entity.length`
Add unparse markdown method 2017-11-26 19:16:59 +03:00			`delimiter = delimiters.get(type(entity), None)`
			`if delimiter:`
			`text = text[:s] + delimiter + text[s:e] + delimiter + text[e:]`
			`elif isinstance(entity, MessageEntityTextUrl) and url_fmt:`
			`text = (`
			`text[:s] +`
Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`_add_surrogate(url_fmt.format(text[s:e], entity.url)) +`
Add unparse markdown method 2017-11-26 19:16:59 +03:00			`text[e:]`
			`)`

Stop working with bytes on the markdown parser 2018-01-07 18:18:54 +03:00			`return _del_surrogate(text)`