Telethon/telethon/_misc/markdown.py

"""
Simple markdown parser which does not support nesting. Intended primarily
for use within the library, which attempts to handle emoji correctly,
since Telegram counts them as two characters (UTF-16 code units).
"""
import re
import warnings

import markdown_it

from .helpers import add_surrogate, del_surrogate, within_surrogate, strip_text
from .. import _tl
from .._misc import tlobject

MARKDOWN = markdown_it.MarkdownIt().enable('strikethrough')
DELIMITERS = {
_tl.MessageEntityBlockquote: ('> ', ''),
_tl.MessageEntityBold: ('**', '**'),
_tl.MessageEntityCode: ('`', '`'),
_tl.MessageEntityItalic: ('_', '_'),
_tl.MessageEntityStrike: ('~~', '~~'),
_tl.MessageEntityUnderline: ('# ', ''),
}
# Not trying to be complete; just enough to have an alternative (mostly for inline underline).
# The fact headings are treated as underline is an implementation detail.
TAG_PATTERN = re.compile(r'<\s*(/?)\s*(\w+)')
HTML_TO_TYPE = {
'i': ('em_close', 'em_open'),
'em': ('em_close', 'em_open'),
'b': ('strong_close', 'strong_open'),
'strong': ('strong_close', 'strong_open'),
's': ('s_close', 's_open'),
'del': ('s_close', 's_open'),
'u': ('heading_open', 'heading_close'),
'mark': ('heading_open', 'heading_close'),
}
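
# For example (illustrative), parse('<u>hi</u>') yields
# ('hi', [_tl.MessageEntityUnderline(offset=0, length=2)]): the <u> and </u> tags
# are rewritten into heading tokens here, and parse() below emits heading tokens
# as underline entities. Note that push() keys off token.nesting rather than the
# rewritten type name, so the (close, open) ordering of the tuples above does not
# affect behaviour.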
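

# Flatten 'inline' tokens into their children and rewrite recognised html_inline
# tags into the corresponding markdown-it token types, so parse() can treat
# Markdown and inline HTML uniformly.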
def expand_inline_and_html(tokens):
for token in tokens:
if token.type == 'inline':
yield from expand_inline_and_html(token.children)
elif token.type == 'html_inline':
match = TAG_PATTERN.match(token.content)
if match:
close, tag = match.groups()
tys = HTML_TO_TYPE.get(tag.lower())
if tys:
token.type = tys[bool(close)]
token.nesting = -1 if close else 1
yield token
else:
yield token


def parse(message):
"""
Parses the given markdown message and returns its stripped representation
plus a list of the _tl.MessageEntity's that were found.
"""
if not message:
return message, []
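
    # push() opens an entity of the given type at the current offset when the
    # token is an opening one (nesting > 0), and on the matching closing token
    # sets the length of the most recently opened entity of that type.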
def push(ty, **extra):
nonlocal message, entities, token
if token.nesting > 0:
entities.append(ty(offset=len(message), length=0, **extra))
else:
for entity in reversed(entities):
if isinstance(entity, ty):
entity.length = len(message) - entity.offset
break
parsed = MARKDOWN.parse(add_surrogate(message.strip()))
message = ''
entities = []
last_map = [0, 0]
for token in expand_inline_and_html(parsed):
if token.map is not None and token.map != last_map:
            # Paragraphs, quotes and fences have a line mapping. Use it to determine how many newlines to insert.
            # But don't insert any (leading) newlines if we have yet to reach the first textual content, or
            # if the mappings are the same (e.g. a quote then opens a paragraph but the mapping is equal).
if message:
message += '\n' + '\n' * (token.map[0] - last_map[-1])
last_map = token.map
if token.type in ('blockquote_close', 'blockquote_open'):
push(_tl.MessageEntityBlockquote)
elif token.type == 'code_block':
entities.append(_tl.MessageEntityPre(offset=len(message), length=len(token.content), language=''))
message += token.content
elif token.type == 'code_inline':
entities.append(_tl.MessageEntityCode(offset=len(message), length=len(token.content)))
message += token.content
elif token.type in ('em_close', 'em_open'):
push(_tl.MessageEntityItalic)
elif token.type == 'fence':
entities.append(_tl.MessageEntityPre(offset=len(message), length=len(token.content), language=token.info))
message += token.content[:-1] # remove a single trailing newline
elif token.type == 'hardbreak':
message += '\n'
elif token.type in ('heading_close', 'heading_open'):
push(_tl.MessageEntityUnderline)
elif token.type == 'hr':
message += '\u2015\n\n'
elif token.type in ('link_close', 'link_open'):
if token.markup != 'autolink': # telegram already picks up on these automatically
push(_tl.MessageEntityTextUrl, url=token.attrs.get('href'))
elif token.type in ('s_close', 's_open'):
push(_tl.MessageEntityStrike)
elif token.type == 'softbreak':
message += ' '
elif token.type in ('strong_close', 'strong_open'):
push(_tl.MessageEntityBold)
elif token.type == 'text':
message += token.content
return del_surrogate(message), entities
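

# Illustrative round trip (plain ASCII input, so surrogates do not matter):
# parse('**hello** _world_') should produce ('hello world',
# [_tl.MessageEntityBold(offset=0, length=5), _tl.MessageEntityItalic(offset=6, length=5)]),
# and unparse() below rebuilds '**hello** _world_' from that pair.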
def unparse(text, entities):
"""
Performs the reverse operation to .parse(), effectively returning
markdown-like syntax given a normal text and its _tl.MessageEntity's.

    Because there are many possible ways for markdown to produce a certain
output, this function cannot invert .parse() perfectly.
"""
if not text or not entities:
return text
if isinstance(entities, tlobject.TLObject):
entities = (entities,)
text = add_surrogate(text)
insert_at = []
for entity in entities:
s = entity.offset
e = entity.offset + entity.length
delimiter = DELIMITERS.get(type(entity), None)
if delimiter:
insert_at.append((s, delimiter[0]))
insert_at.append((e, delimiter[1]))
elif isinstance(entity, _tl.MessageEntityPre):
insert_at.append((s, f'```{entity.language}\n'))
insert_at.append((e, '```\n'))
elif isinstance(entity, _tl.MessageEntityTextUrl):
insert_at.append((s, '['))
insert_at.append((e, f']({entity.url})'))
elif isinstance(entity, _tl.MessageEntityMentionName):
insert_at.append((s, '['))
insert_at.append((e, f'](tg://user?id={entity.user_id})'))
insert_at.sort(key=lambda t: t[0])
while insert_at:
at, what = insert_at.pop()
        # If we are in the middle of a surrogate, nudge the position by +1.
        # Otherwise we would end up with malformed text and fail to encode.
        # Example of bad input: "Hi \ud83d\ude1c"
        # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
while within_surrogate(text, at):
at += 1
text = text[:at] + what + text[at:]
return del_surrogate(text)
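
# Example with an emoji (illustrative; offsets count UTF-16 code units, so the
# emoji occupies positions 0-1 and 'hi' starts at offset 3):
#     unparse('😀 hi', [_tl.MessageEntityBold(offset=3, length=2)])
#     # -> '😀 **hi**'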