# Telethon/telethon/extensions/markdown.py

"""
Simple markdown parser which does not support nesting. Intended primarily
for use within the library, which attempts to handle emojies correctly,
since they seem to count as two characters and it's a bit strange.
"""
import re

from ..tl.types import (
    MessageEntityBold, MessageEntityItalic, MessageEntityCode,
    MessageEntityPre, MessageEntityTextUrl
)


DEFAULT_DELIMITERS = {
    '**': MessageEntityBold,
    '__': MessageEntityItalic,
    '`': MessageEntityCode,
    '```': MessageEntityPre
}

# Regex used to match utf-16le encoded r'\[(.+?)\]\((.+?)\)',
# which is why there is a '\0' after every literal character.
DEFAULT_URL_RE = re.compile(b'\\[\0(.+?)\\]\0\\(\0(.+?)\\)\0')
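# For instance (illustrative), '[a](b)'.encode('utf-16le') yields
# b'[\x00a\x00]\x00(\x00b\x00)\x00', which the pattern above matches with
# group(1) == b'a\x00' and group(2) == b'b\x00'.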


def parse(message, delimiters=None, url_re=None):
    """
    Parses the given message and returns the stripped message and a list
    of MessageEntity* using the specified delimiters dictionary (or the
    default if None). The dictionary should be a mapping of
    {delimiter: entity class}.

    The url_re(gex) must contain two matching groups: the text to be
    clickable and the URL itself, and it must be utf-16le encoded.
    """
    if url_re is None:
        url_re = DEFAULT_URL_RE
    elif url_re:
        if isinstance(url_re, bytes):
            url_re = re.compile(url_re)

    if not delimiters:
        if delimiters is not None:
            return message, []
        delimiters = DEFAULT_DELIMITERS

    delimiters = {k.encode('utf-16le'): v for k, v in delimiters.items()}
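    # Note (illustrative): '**'.encode('utf-16le') == b'*\x00*\x00', so the
    # encoded delimiters can be compared directly against slices of the
    # encoded message below.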

    # Cannot use a for loop because we need to skip some indices.
    i = 0
    result = []
    current = None

    # Work on the byte level with the utf-16le encoding to get the offsets
    # right. The offset will just be half the index we're at.
    message = message.encode('utf-16le')
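    # For example, '👍'.encode('utf-16le') is four bytes long (a surrogate
    # pair), so an emoji counts as two "characters" here, which matches how
    # Telegram measures entity offsets and lengths.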
    while i < len(message):
        url_match = None
        if url_re and current is None:
            # If we're not inside a previous match (Telegram doesn't allow
            # nested message entities), try matching the URL from the i'th
            # position.
            url_match = url_re.match(message, pos=i)
            if url_match:
                # Replace the whole match with only the inline URL text.
                message = b''.join((
                    message[:url_match.start()],
                    url_match.group(1),
                    message[url_match.end():]
                ))

                result.append(MessageEntityTextUrl(
                    offset=i // 2, length=len(url_match.group(1)) // 2,
                    url=url_match.group(2).decode('utf-16le')
                ))
                # The URL markup is now gone, and we'll add +2 before the
                # next iteration, which would make us skip a character.
                # Go back by one utf-16 encoded character (-2) to avoid it.
                i += len(url_match.group(1)) - 2
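                # Worked example (illustrative): for message b'[a](b)' with
                # i == 0, the text collapses to b'a\x00', the entity gets
                # offset 0, length 1 and url 'b', and the two adjustments
                # (-2 here, +2 below) leave i just past the inline text.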

        if not url_match:
            for d, m in delimiters.items():
                # Slice the string at the current i'th position to see if
                # it matches the current delimiter d.
                if message[i:i + len(d)] == d:
                    if current is not None and not isinstance(current, m):
                        # We were inside another delimiter/mode, ignore this.
                        continue

                    if message[i + len(d):i + 2 * len(d)] == d:
                        # The same delimiter can't come right afterwards;
                        # if it did, we would match empty strings like ``
                        # which we don't want to.
                        continue

                    # Get rid of the delimiter by slicing it away.
                    message = message[:i] + message[i + len(d):]
                    if current is None:
                        if m == MessageEntityPre:
                            # Special case, it also has a 'lang' field.
                            current = MessageEntityPre(i // 2, None, '')
                        else:
                            current = m(i // 2, None)
                        # No need for i -= 2 here because it's already been
                        # checked that the next character won't be a
                        # delimiter.
                    else:
                        current.length = (i // 2) - current.offset
                        result.append(current)
                        current = None
                        i -= 2  # Delimiter matched and gone, go back 1 char

                    break

        # Next iteration; utf-16 encoded characters need 2 bytes.
        i += 2

    # We may have found a delimiter but not its closing pair.
    # TODO Should probably insert such a delimiter back in the string.
    return message.decode('utf-16le'), result
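

# A quick, illustrative sketch of the parser's end-to-end behaviour (exact
# entity reprs are elided; offsets and lengths are utf-16 code units):
#
#   text, entities = parse('hello **world** and [docs](https://example.com)')
#   # text == 'hello world and docs'
#   # entities ~ [MessageEntityBold(offset=6, length=5),
#   #             MessageEntityTextUrl(offset=16, length=4,
#   #                                  url='https://example.com')]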