Telethon/telethon/_misc/markdown.py

"""
Simple markdown parser which does not support nesting. Intended primarily
for use within the library, which attempts to handle emoji correctly,
since Telegram counts them as two characters (UTF-16 code units).
"""
import re
import warnings

import markdown_it

from .helpers import add_surrogate, del_surrogate, within_surrogate, strip_text
from .. import _tl
from .._misc import tlobject

MARKDOWN = markdown_it.MarkdownIt().enable('strikethrough')
DELIMITERS = {
_tl.MessageEntityBlockquote: ('> ', ''),
_tl.MessageEntityBold: ('**', '**'),
_tl.MessageEntityCode: ('`', '`'),
_tl.MessageEntityItalic: ('_', '_'),
_tl.MessageEntityStrike: ('~~', '~~'),
_tl.MessageEntityUnderline: ('# ', ''),
}
# Not trying to be complete; just enough to have an alternative (mostly for inline underline).
# The fact headings are treated as underline is an implementation detail.
TAG_PATTERN = re.compile(r'<\s*(/?)\s*(\w+)')
HTML_TO_TYPE = {
'i': ('em_close', 'em_open'),
'em': ('em_close', 'em_open'),
'b': ('strong_close', 'strong_open'),
'strong': ('strong_close', 'strong_open'),
's': ('s_close', 's_open'),
'del': ('s_close', 's_open'),
'u': ('heading_open', 'heading_close'),
'mark': ('heading_open', 'heading_close'),
}
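
# For example (illustrative), parse('<u>hi</u>') yields
# ('hi', [_tl.MessageEntityUnderline(offset=0, length=2)]): the <u> and </u> tags
# are rewritten into heading tokens here, and parse() below emits heading tokens
# as underline entities. Note that push() keys off token.nesting rather than the
# rewritten type name, so the (close, open) ordering of the tuples above does not
# affect behaviour.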
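

# Flatten 'inline' tokens into their children and rewrite recognised html_inline
# tags into the corresponding markdown-it token types, so parse() can treat
# Markdown and inline HTML uniformly.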
def expand_inline_and_html(tokens):
for token in tokens:
if token.type == 'inline':
yield from expand_inline_and_html(token.children)
elif token.type == 'html_inline':
match = TAG_PATTERN.match(token.content)
if match:
close, tag = match.groups()
tys = HTML_TO_TYPE.get(tag.lower())
if tys:
token.type = tys[bool(close)]
token.nesting = -1 if close else 1
yield token
else:
yield token


def parse(message):
"""
Parses the given markdown message and returns its stripped representation
plus a list of the _tl.MessageEntity's that were found.
"""
if not message:
return message, []
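
    # push() opens an entity of the given type at the current offset when the
    # token is an opening one (nesting > 0), and on the matching closing token
    # sets the length of the most recently opened entity of that type.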
def push(ty, **extra):
nonlocal message, entities, token
if token.nesting > 0:
entities.append(ty(offset=len(message), length=0, **extra))
else:
for entity in reversed(entities):
if isinstance(entity, ty):
entity.length = len(message) - entity.offset
break
parsed = MARKDOWN.parse(add_surrogate(message.strip()))
message = ''
entities = []
last_map = [0, 0]
for token in expand_inline_and_html(parsed):
if token.map is not None and token.map != last_map:
            # Paragraphs, quotes and fences have a line mapping. Use it to determine how many newlines to insert.
            # But don't insert any (leading) newlines if we have yet to reach the first textual content, or
            # if the mappings are the same (e.g. a quote then opens a paragraph but the mapping is equal).
if message:
message += '\n' + '\n' * (token.map[0] - last_map[-1])
last_map = token.map
if token.type in ('blockquote_close', 'blockquote_open'):
push(_tl.MessageEntityBlockquote)
elif token.type == 'code_block':
entities.append(_tl.MessageEntityPre(offset=len(message), length=len(token.content), language=''))
message += token.content
elif token.type == 'code_inline':
entities.append(_tl.MessageEntityCode(offset=len(message), length=len(token.content)))
message += token.content
elif token.type in ('em_close', 'em_open'):
push(_tl.MessageEntityItalic)
elif token.type == 'fence':
entities.append(_tl.MessageEntityPre(offset=len(message), length=len(token.content), language=token.info))
message += token.content[:-1] # remove a single trailing newline
elif token.type == 'hardbreak':
message += '\n'
elif token.type in ('heading_close', 'heading_open'):
push(_tl.MessageEntityUnderline)
elif token.type == 'hr':
message += '\u2015\n\n'
elif token.type in ('link_close', 'link_open'):
if token.markup != 'autolink': # telegram already picks up on these automatically
push(_tl.MessageEntityTextUrl, url=token.attrs.get('href'))
elif token.type in ('s_close', 's_open'):
push(_tl.MessageEntityStrike)
elif token.type == 'softbreak':
message += ' '
elif token.type in ('strong_close', 'strong_open'):
push(_tl.MessageEntityBold)
elif token.type == 'text':
message += token.content
return del_surrogate(message), entities
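

# Illustrative round trip (plain ASCII input, so surrogates do not matter):
# parse('**hello** _world_') should produce ('hello world',
# [_tl.MessageEntityBold(offset=0, length=5), _tl.MessageEntityItalic(offset=6, length=5)]),
# and unparse() below rebuilds '**hello** _world_' from that pair.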
def unparse(text, entities):
"""
Performs the reverse operation to .parse(), effectively returning
markdown-like syntax given a normal text and its _tl.MessageEntity's.

    Because there are many possible ways for markdown to produce a certain
output, this function cannot invert .parse() perfectly.
"""
if not text or not entities:
return text
if isinstance(entities, tlobject.TLObject):
entities = (entities,)
text = add_surrogate(text)
insert_at = []
for entity in entities:
s = entity.offset
e = entity.offset + entity.length
delimiter = DELIMITERS.get(type(entity), None)
if delimiter:
insert_at.append((s, delimiter[0]))
insert_at.append((e, delimiter[1]))
elif isinstance(entity, _tl.MessageEntityPre):
insert_at.append((s, f'```{entity.language}\n'))
insert_at.append((e, '```\n'))
elif isinstance(entity, _tl.MessageEntityTextUrl):
insert_at.append((s, '['))
insert_at.append((e, f']({entity.url})'))
elif isinstance(entity, _tl.MessageEntityMentionName):
insert_at.append((s, '['))
insert_at.append((e, f'](tg://user?id={entity.user_id})'))
insert_at.sort(key=lambda t: t[0])
while insert_at:
at, what = insert_at.pop()
        # If we are in the middle of a surrogate, nudge the position by +1.
        # Otherwise we would end up with malformed text and fail to encode.
        # Example of bad input: "Hi \ud83d\ude1c"
        # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
while within_surrogate(text, at):
at += 1
text = text[:at] + what + text[at:]
return del_surrogate(text)
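
# Example with an emoji (illustrative; offsets count UTF-16 code units, so the
# emoji occupies positions 0-1 and 'hi' starts at offset 3):
#     unparse('😀 hi', [_tl.MessageEntityBold(offset=3, length=2)])
#     # -> '😀 **hi**'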