mirror of
				https://github.com/LonamiWebs/Telethon.git
				synced 2025-11-04 01:47:27 +03:00 
			
		
		
		
	Use a proper markdown parser
This commit is contained in:
		
							parent
							
								
									1762f554df
								
							
						
					
					
						commit
						6fec2a68c5
					
				| 
						 | 
					@ -206,6 +206,27 @@ The ``telethon.errors`` module continues to provide custom errors used by the li
 | 
				
			||||||
// TODO should RpcError subclass ValueError? technically the values used in the request somehow were wrong…
 | 
					// TODO should RpcError subclass ValueError? technically the values used in the request somehow were wrong…
 | 
				
			||||||
// TODO provide a way to see which errors are known in the docs or at tl.telethon.dev
 | 
					// TODO provide a way to see which errors are known in the docs or at tl.telethon.dev
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The default markdown parse mode now conforms to the commonmark specification
 | 
				
			||||||
 | 
					----------------------------------------------------------------------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The old markdown parser (which was used as the default ``client.parse_mode``) used to emulate
 | 
				
			||||||
 | 
					Telegram Desktop's behaviour. Now `markdown-it-py <https://github.com/executablebooks/markdown-it-py>`__
 | 
				
			||||||
 | 
					is used instead, which fixes certain parsing bugs but also means the formatting will be different.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Most notably, ``__`` will now make text bold. If you want the old behaviour, use a single
 | 
				
			||||||
 | 
					underscore instead (such as ``_``). You can also use a single asterisk (``*``) for italics.
 | 
				
			||||||
 | 
					Because now there's proper parsing, you also gain:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					* Headings (``# text``) will now be underlined.
 | 
				
			||||||
 | 
					* Certain HTML tags will now also be recognized in markdown (including ``<u>`` for underlining text).
 | 
				
			||||||
 | 
					* Line breaks behave properly now. For a single-line break, end your line with ``\\``.
 | 
				
			||||||
 | 
					* Inline links should no longer behave in a strange manner.
 | 
				
			||||||
 | 
					* Pre-blocks can now have a language. Official clients don't syntax highlight code yet, though.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// TODO provide a way to get back the old behaviour?
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
The "iter" variant of the client methods have been removed
 | 
					The "iter" variant of the client methods have been removed
 | 
				
			||||||
----------------------------------------------------------
 | 
					----------------------------------------------------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,2 +1,3 @@
 | 
				
			||||||
pyaes
 | 
					markdown-it-py~=1.1.0
 | 
				
			||||||
rsa
 | 
					pyaes~=1.6.1
 | 
				
			||||||
 | 
					rsa~=4.7.2
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -5,177 +5,152 @@ since they seem to count as two characters and it's a bit strange.
 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
import warnings
 | 
					import warnings
 | 
				
			||||||
 | 
					import markdown_it
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .helpers import add_surrogate, del_surrogate, within_surrogate, strip_text
 | 
					from .helpers import add_surrogate, del_surrogate, within_surrogate, strip_text
 | 
				
			||||||
from .. import _tl
 | 
					from .. import _tl
 | 
				
			||||||
from .._misc import tlobject
 | 
					from .._misc import tlobject
 | 
				
			||||||
 | 
					
 | 
				
			||||||
DEFAULT_DELIMITERS = {
 | 
					
 | 
				
			||||||
    '**': _tl.MessageEntityBold,
 | 
					MARKDOWN = markdown_it.MarkdownIt().enable('strikethrough')
 | 
				
			||||||
    '__': _tl.MessageEntityItalic,
 | 
					DELIMITERS = {
 | 
				
			||||||
    '~~': _tl.MessageEntityStrike,
 | 
					    _tl.MessageEntityBlockquote: ('> ', ''),
 | 
				
			||||||
    '`': _tl.MessageEntityCode,
 | 
					    _tl.MessageEntityBold: ('**', '**'),
 | 
				
			||||||
    '```': _tl.MessageEntityPre
 | 
					    _tl.MessageEntityCode: ('`', '`'),
 | 
				
			||||||
 | 
					    _tl.MessageEntityItalic: ('_', '_'),
 | 
				
			||||||
 | 
					    _tl.MessageEntityStrike: ('~~', '~~'),
 | 
				
			||||||
 | 
					    _tl.MessageEntityUnderline: ('# ', ''),
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
DEFAULT_URL_RE = re.compile(r'\[([\S\s]+?)\]\((.+?)\)')
 | 
					# Not trying to be complete; just enough to have an alternative (mostly for inline underline).
 | 
				
			||||||
DEFAULT_URL_FORMAT = '[{0}]({1})'
 | 
					# The fact headings are treated as underline is an implementation detail.
 | 
				
			||||||
 | 
					TAG_PATTERN = re.compile(r'<\s*(/?)\s*(\w+)')
 | 
				
			||||||
 | 
					HTML_TO_TYPE = {
 | 
				
			||||||
 | 
					    'i': ('em_close', 'em_open'),
 | 
				
			||||||
 | 
					    'em': ('em_close', 'em_open'),
 | 
				
			||||||
 | 
					    'b': ('strong_close', 'strong_open'),
 | 
				
			||||||
 | 
					    'strong': ('strong_close', 'strong_open'),
 | 
				
			||||||
 | 
					    's': ('s_close', 's_open'),
 | 
				
			||||||
 | 
					    'del': ('s_close', 's_open'),
 | 
				
			||||||
 | 
					    'u': ('heading_open', 'heading_close'),
 | 
				
			||||||
 | 
					    'mark': ('heading_open', 'heading_close'),
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def overlap(a, b, x, y):
 | 
					def expand_inline_and_html(tokens):
 | 
				
			||||||
    return max(a, x) < min(b, y)
 | 
					    for token in tokens:
 | 
				
			||||||
 | 
					        if token.type == 'inline':
 | 
				
			||||||
 | 
					            yield from expand_inline_and_html(token.children)
 | 
				
			||||||
 | 
					        elif token.type == 'html_inline':
 | 
				
			||||||
 | 
					            match = TAG_PATTERN.match(token.content)
 | 
				
			||||||
 | 
					            if match:
 | 
				
			||||||
 | 
					                close, tag = match.groups()
 | 
				
			||||||
 | 
					                tys = HTML_TO_TYPE.get(tag.lower())
 | 
				
			||||||
 | 
					                if tys:
 | 
				
			||||||
 | 
					                    token.type = tys[bool(close)]
 | 
				
			||||||
 | 
					                    token.nesting = -1 if close else 1
 | 
				
			||||||
 | 
					                    yield token
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            yield token
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def parse(message, delimiters=None, url_re=None):
 | 
					def parse(message):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Parses the given markdown message and returns its stripped representation
 | 
					    Parses the given markdown message and returns its stripped representation
 | 
				
			||||||
    plus a list of the _tl.MessageEntity's that were found.
 | 
					    plus a list of the _tl.MessageEntity's that were found.
 | 
				
			||||||
 | 
					 | 
				
			||||||
    :param message: the message with markdown-like syntax to be parsed.
 | 
					 | 
				
			||||||
    :param delimiters: the delimiters to be used, {delimiter: type}.
 | 
					 | 
				
			||||||
    :param url_re: the URL bytes regex to be used. Must have two groups.
 | 
					 | 
				
			||||||
    :return: a tuple consisting of (clean message, [message entities]).
 | 
					 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    if not message:
 | 
					    if not message:
 | 
				
			||||||
        return message, []
 | 
					        return message, []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if url_re is None:
 | 
					    def push(ty, **extra):
 | 
				
			||||||
        url_re = DEFAULT_URL_RE
 | 
					        nonlocal message, entities, token
 | 
				
			||||||
    elif isinstance(url_re, str):
 | 
					        if token.nesting > 0:
 | 
				
			||||||
        url_re = re.compile(url_re)
 | 
					            entities.append(ty(offset=len(message), length=0, **extra))
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            for entity in reversed(entities):
 | 
				
			||||||
 | 
					                if isinstance(entity, ty):
 | 
				
			||||||
 | 
					                    entity.length = len(message) - entity.offset
 | 
				
			||||||
 | 
					                    break
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if not delimiters:
 | 
					    parsed = MARKDOWN.parse(add_surrogate(message.strip()))
 | 
				
			||||||
        if delimiters is not None:
 | 
					    message = ''
 | 
				
			||||||
            return message, []
 | 
					    entities = []
 | 
				
			||||||
        delimiters = DEFAULT_DELIMITERS
 | 
					    last_map = [0, 0]
 | 
				
			||||||
 | 
					    for token in expand_inline_and_html(parsed):
 | 
				
			||||||
 | 
					        if token.map is not None and token.map != last_map:
 | 
				
			||||||
 | 
					            # Paragraphs, quotes and fences have a line mapping. Use it to determine how many newlines to insert.
 | 
				
			||||||
 | 
					            # But don't insert any (leading) new lines if we're yet to reach the first textual content, or
 | 
				
			||||||
 | 
					            # if the mappings are the same (e.g. a quote then opens a paragraph but the mapping is equal).
 | 
				
			||||||
 | 
					            if message:
 | 
				
			||||||
 | 
					                message += '\n' + '\n' * (token.map[0] - last_map[-1])
 | 
				
			||||||
 | 
					            last_map = token.map
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Build a regex to efficiently test all delimiters at once.
 | 
					        if token.type in ('blockquote_close', 'blockquote_open'):
 | 
				
			||||||
    # Note that the largest delimiter should go first, we don't
 | 
					            push(_tl.MessageEntityBlockquote)
 | 
				
			||||||
    # want ``` to be interpreted as a single back-tick in a code block.
 | 
					        elif token.type == 'code_block':
 | 
				
			||||||
    delim_re = re.compile('|'.join('({})'.format(re.escape(k))
 | 
					            entities.append(_tl.MessageEntityPre(offset=len(message), length=len(token.content), language=''))
 | 
				
			||||||
                                   for k in sorted(delimiters, key=len, reverse=True)))
 | 
					            message += token.content
 | 
				
			||||||
 | 
					        elif token.type == 'code_inline':
 | 
				
			||||||
 | 
					            entities.append(_tl.MessageEntityCode(offset=len(message), length=len(token.content)))
 | 
				
			||||||
 | 
					            message += token.content
 | 
				
			||||||
 | 
					        elif token.type in ('em_close', 'em_open'):
 | 
				
			||||||
 | 
					            push(_tl.MessageEntityItalic)
 | 
				
			||||||
 | 
					        elif token.type == 'fence':
 | 
				
			||||||
 | 
					            entities.append(_tl.MessageEntityPre(offset=len(message), length=len(token.content), language=token.info))
 | 
				
			||||||
 | 
					            message += token.content[:-1]  # remove a single trailing newline
 | 
				
			||||||
 | 
					        elif token.type == 'hardbreak':
 | 
				
			||||||
 | 
					            message += '\n'
 | 
				
			||||||
 | 
					        elif token.type in ('heading_close', 'heading_open'):
 | 
				
			||||||
 | 
					            push(_tl.MessageEntityUnderline)
 | 
				
			||||||
 | 
					        elif token.type == 'hr':
 | 
				
			||||||
 | 
					            message += '\u2015\n\n'
 | 
				
			||||||
 | 
					        elif token.type in ('link_close', 'link_open'):
 | 
				
			||||||
 | 
					            if token.markup != 'autolink':  # telegram already picks up on these automatically
 | 
				
			||||||
 | 
					                push(_tl.MessageEntityTextUrl, url=token.attrs.get('href'))
 | 
				
			||||||
 | 
					        elif token.type in ('s_close', 's_open'):
 | 
				
			||||||
 | 
					            push(_tl.MessageEntityStrike)
 | 
				
			||||||
 | 
					        elif token.type == 'softbreak':
 | 
				
			||||||
 | 
					            message += ' '
 | 
				
			||||||
 | 
					        elif token.type in ('strong_close', 'strong_open'):
 | 
				
			||||||
 | 
					            push(_tl.MessageEntityBold)
 | 
				
			||||||
 | 
					        elif token.type == 'text':
 | 
				
			||||||
 | 
					            message += token.content
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Cannot use a for loop because we need to skip some indices
 | 
					    return del_surrogate(message), entities
 | 
				
			||||||
    i = 0
 | 
					 | 
				
			||||||
    result = []
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    # Work on byte level with the utf-16le encoding to get the offsets right.
 | 
					 | 
				
			||||||
    # The offset will just be half the index we're at.
 | 
					 | 
				
			||||||
    message = add_surrogate(message)
 | 
					 | 
				
			||||||
    while i < len(message):
 | 
					 | 
				
			||||||
        m = delim_re.match(message, pos=i)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        # Did we find some delimiter here at `i`?
 | 
					 | 
				
			||||||
        if m:
 | 
					 | 
				
			||||||
            delim = next(filter(None, m.groups()))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            # +1 to avoid matching right after (e.g. "****")
 | 
					 | 
				
			||||||
            end = message.find(delim, i + len(delim) + 1)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            # Did we find the earliest closing tag?
 | 
					 | 
				
			||||||
            if end != -1:
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                # Remove the delimiter from the string
 | 
					 | 
				
			||||||
                message = ''.join((
 | 
					 | 
				
			||||||
                        message[:i],
 | 
					 | 
				
			||||||
                        message[i + len(delim):end],
 | 
					 | 
				
			||||||
                        message[end + len(delim):]
 | 
					 | 
				
			||||||
                ))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                # Check other affected entities
 | 
					 | 
				
			||||||
                for ent in result:
 | 
					 | 
				
			||||||
                    # If the end is after our start, it is affected
 | 
					 | 
				
			||||||
                    if ent.offset + ent.length > i:
 | 
					 | 
				
			||||||
                        # If the old start is also before ours, it is fully enclosed
 | 
					 | 
				
			||||||
                        if ent.offset <= i:
 | 
					 | 
				
			||||||
                            ent.length -= len(delim) * 2
 | 
					 | 
				
			||||||
                        else:
 | 
					 | 
				
			||||||
                            ent.length -= len(delim)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                # Append the found entity
 | 
					 | 
				
			||||||
                ent = delimiters[delim]
 | 
					 | 
				
			||||||
                if ent == _tl.MessageEntityPre:
 | 
					 | 
				
			||||||
                    result.append(ent(i, end - i - len(delim), ''))  # has 'lang'
 | 
					 | 
				
			||||||
                else:
 | 
					 | 
				
			||||||
                    result.append(ent(i, end - i - len(delim)))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                # No nested entities inside code blocks
 | 
					 | 
				
			||||||
                if ent in (_tl.MessageEntityCode, _tl.MessageEntityPre):
 | 
					 | 
				
			||||||
                    i = end - len(delim)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                continue
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        elif url_re:
 | 
					 | 
				
			||||||
            m = url_re.match(message, pos=i)
 | 
					 | 
				
			||||||
            if m:
 | 
					 | 
				
			||||||
                # Replace the whole match with only the inline URL text.
 | 
					 | 
				
			||||||
                message = ''.join((
 | 
					 | 
				
			||||||
                    message[:m.start()],
 | 
					 | 
				
			||||||
                    m.group(1),
 | 
					 | 
				
			||||||
                    message[m.end():]
 | 
					 | 
				
			||||||
                ))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                delim_size = m.end() - m.start() - len(m.group())
 | 
					 | 
				
			||||||
                for ent in result:
 | 
					 | 
				
			||||||
                    # If the end is after our start, it is affected
 | 
					 | 
				
			||||||
                    if ent.offset + ent.length > m.start():
 | 
					 | 
				
			||||||
                        ent.length -= delim_size
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                result.append(_tl.MessageEntityTextUrl(
 | 
					 | 
				
			||||||
                    offset=m.start(), length=len(m.group(1)),
 | 
					 | 
				
			||||||
                    url=del_surrogate(m.group(2))
 | 
					 | 
				
			||||||
                ))
 | 
					 | 
				
			||||||
                i += len(m.group(1))
 | 
					 | 
				
			||||||
                continue
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        i += 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    message = strip_text(message, result)
 | 
					 | 
				
			||||||
    return del_surrogate(message), result
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def unparse(text, entities, delimiters=None, url_fmt=None):
 | 
					def unparse(text, entities):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Performs the reverse operation to .parse(), effectively returning
 | 
					    Performs the reverse operation to .parse(), effectively returning
 | 
				
			||||||
    markdown-like syntax given a normal text and its _tl.MessageEntity's.
 | 
					    markdown-like syntax given a normal text and its _tl.MessageEntity's.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    :param text: the text to be reconverted into markdown.
 | 
					    Because there are many possible ways for markdown to produce a certain
 | 
				
			||||||
    :param entities: the _tl.MessageEntity's applied to the text.
 | 
					    output, this function cannot invert .parse() perfectly.
 | 
				
			||||||
    :return: a markdown-like text representing the combination of both inputs.
 | 
					 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    if not text or not entities:
 | 
					    if not text or not entities:
 | 
				
			||||||
        return text
 | 
					        return text
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if not delimiters:
 | 
					 | 
				
			||||||
        if delimiters is not None:
 | 
					 | 
				
			||||||
            return text
 | 
					 | 
				
			||||||
        delimiters = DEFAULT_DELIMITERS
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    if url_fmt is not None:
 | 
					 | 
				
			||||||
        warnings.warn('url_fmt is deprecated')  # since it complicates everything *a lot*
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    if isinstance(entities, tlobject.TLObject):
 | 
					    if isinstance(entities, tlobject.TLObject):
 | 
				
			||||||
        entities = (entities,)
 | 
					        entities = (entities,)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    text = add_surrogate(text)
 | 
					    text = add_surrogate(text)
 | 
				
			||||||
    delimiters = {v: k for k, v in delimiters.items()}
 | 
					 | 
				
			||||||
    insert_at = []
 | 
					    insert_at = []
 | 
				
			||||||
    for entity in entities:
 | 
					    for entity in entities:
 | 
				
			||||||
        s = entity.offset
 | 
					        s = entity.offset
 | 
				
			||||||
        e = entity.offset + entity.length
 | 
					        e = entity.offset + entity.length
 | 
				
			||||||
        delimiter = delimiters.get(type(entity), None)
 | 
					        delimiter = DELIMITERS.get(type(entity), None)
 | 
				
			||||||
        if delimiter:
 | 
					        if delimiter:
 | 
				
			||||||
            insert_at.append((s, delimiter))
 | 
					            insert_at.append((s, delimiter[0]))
 | 
				
			||||||
            insert_at.append((e, delimiter))
 | 
					            insert_at.append((e, delimiter[1]))
 | 
				
			||||||
        else:
 | 
					        elif isinstance(entity, _tl.MessageEntityPre):
 | 
				
			||||||
            url = None
 | 
					            insert_at.append((s, f'```{entity.language}\n'))
 | 
				
			||||||
            if isinstance(entity, _tl.MessageEntityTextUrl):
 | 
					            insert_at.append((e, '```\n'))
 | 
				
			||||||
                url = entity.url
 | 
					        elif isinstance(entity, _tl.MessageEntityTextUrl):
 | 
				
			||||||
            elif isinstance(entity, _tl.MessageEntityMentionName):
 | 
					            insert_at.append((s, '['))
 | 
				
			||||||
                url = 'tg://user?id={}'.format(entity.user_id)
 | 
					            insert_at.append((e, f']({entity.url})'))
 | 
				
			||||||
            if url:
 | 
					        elif isinstance(entity, _tl.MessageEntityMentionName):
 | 
				
			||||||
                insert_at.append((s, '['))
 | 
					            insert_at.append((s, '['))
 | 
				
			||||||
                insert_at.append((e, ']({})'.format(url)))
 | 
					            insert_at.append((e, f'](tg://user?id={entity.user_id})'))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    insert_at.sort(key=lambda t: t[0])
 | 
					    insert_at.sort(key=lambda t: t[0])
 | 
				
			||||||
    while insert_at:
 | 
					    while insert_at:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user