2017-10-28 20:06:41 +03:00
|
|
|
"""
|
|
|
|
Simple markdown parser which does not support nesting. Intended primarily
|
|
|
|
for use within the library, which attempts to handle emojies correctly,
|
|
|
|
since they seem to count as two characters and it's a bit strange.
|
|
|
|
"""
|
|
|
|
import re
|
2019-06-24 14:48:29 +03:00
|
|
|
import warnings
|
2021-09-25 21:33:25 +03:00
|
|
|
import markdown_it
|
2017-11-16 21:13:13 +03:00
|
|
|
|
2021-09-12 14:27:13 +03:00
|
|
|
from .helpers import add_surrogate, del_surrogate, within_surrogate, strip_text
|
2021-09-12 13:16:02 +03:00
|
|
|
from .. import _tl
|
2021-09-12 17:58:06 +03:00
|
|
|
from .._misc import tlobject
|
2017-10-28 20:06:41 +03:00
|
|
|
|
2021-09-25 21:33:25 +03:00
|
|
|
|
|
|
|
MARKDOWN = markdown_it.MarkdownIt().enable('strikethrough')
|
|
|
|
DELIMITERS = {
|
|
|
|
_tl.MessageEntityBlockquote: ('> ', ''),
|
|
|
|
_tl.MessageEntityBold: ('**', '**'),
|
|
|
|
_tl.MessageEntityCode: ('`', '`'),
|
|
|
|
_tl.MessageEntityItalic: ('_', '_'),
|
|
|
|
_tl.MessageEntityStrike: ('~~', '~~'),
|
|
|
|
_tl.MessageEntityUnderline: ('# ', ''),
|
2017-10-29 20:21:21 +03:00
|
|
|
}
|
|
|
|
|
2021-09-25 21:33:25 +03:00
|
|
|
# Not trying to be complete; just enough to have an alternative (mostly for inline underline).
|
|
|
|
# The fact headings are treated as underline is an implementation detail.
|
|
|
|
TAG_PATTERN = re.compile(r'<\s*(/?)\s*(\w+)')
|
|
|
|
HTML_TO_TYPE = {
|
|
|
|
'i': ('em_close', 'em_open'),
|
|
|
|
'em': ('em_close', 'em_open'),
|
|
|
|
'b': ('strong_close', 'strong_open'),
|
|
|
|
'strong': ('strong_close', 'strong_open'),
|
|
|
|
's': ('s_close', 's_open'),
|
|
|
|
'del': ('s_close', 's_open'),
|
|
|
|
'u': ('heading_open', 'heading_close'),
|
|
|
|
'mark': ('heading_open', 'heading_close'),
|
|
|
|
}
|
2017-11-26 19:16:59 +03:00
|
|
|
|
2018-01-07 18:18:54 +03:00
|
|
|
|
2021-09-25 21:33:25 +03:00
|
|
|
def expand_inline_and_html(tokens):
|
|
|
|
for token in tokens:
|
|
|
|
if token.type == 'inline':
|
|
|
|
yield from expand_inline_and_html(token.children)
|
|
|
|
elif token.type == 'html_inline':
|
|
|
|
match = TAG_PATTERN.match(token.content)
|
|
|
|
if match:
|
|
|
|
close, tag = match.groups()
|
|
|
|
tys = HTML_TO_TYPE.get(tag.lower())
|
|
|
|
if tys:
|
|
|
|
token.type = tys[bool(close)]
|
|
|
|
token.nesting = -1 if close else 1
|
|
|
|
yield token
|
|
|
|
else:
|
|
|
|
yield token
|
2019-06-24 14:48:29 +03:00
|
|
|
|
|
|
|
|
2021-09-25 21:33:25 +03:00
|
|
|
def parse(message):
|
2017-10-28 20:06:41 +03:00
|
|
|
"""
|
2017-11-26 19:14:28 +03:00
|
|
|
Parses the given markdown message and returns its stripped representation
|
2021-09-12 13:16:02 +03:00
|
|
|
plus a list of the _tl.MessageEntity's that were found.
|
2017-10-28 20:06:41 +03:00
|
|
|
"""
|
2018-06-03 14:48:43 +03:00
|
|
|
if not message:
|
|
|
|
return message, []
|
|
|
|
|
2021-09-25 21:33:25 +03:00
|
|
|
def push(ty, **extra):
|
|
|
|
nonlocal message, entities, token
|
|
|
|
if token.nesting > 0:
|
|
|
|
entities.append(ty(offset=len(message), length=0, **extra))
|
|
|
|
else:
|
|
|
|
for entity in reversed(entities):
|
|
|
|
if isinstance(entity, ty):
|
|
|
|
entity.length = len(message) - entity.offset
|
|
|
|
break
|
|
|
|
|
|
|
|
parsed = MARKDOWN.parse(add_surrogate(message.strip()))
|
|
|
|
message = ''
|
|
|
|
entities = []
|
|
|
|
last_map = [0, 0]
|
|
|
|
for token in expand_inline_and_html(parsed):
|
|
|
|
if token.map is not None and token.map != last_map:
|
|
|
|
# paragraphs, quotes fences have a line mapping. Use it to determine how many newlines to insert.
|
|
|
|
# But don't inssert any (leading) new lines if we're yet to reach the first textual content, or
|
|
|
|
# if the mappings are the same (e.g. a quote then opens a paragraph but the mapping is equal).
|
|
|
|
if message:
|
|
|
|
message += '\n' + '\n' * (token.map[0] - last_map[-1])
|
|
|
|
last_map = token.map
|
|
|
|
|
|
|
|
if token.type in ('blockquote_close', 'blockquote_open'):
|
|
|
|
push(_tl.MessageEntityBlockquote)
|
|
|
|
elif token.type == 'code_block':
|
|
|
|
entities.append(_tl.MessageEntityPre(offset=len(message), length=len(token.content), language=''))
|
|
|
|
message += token.content
|
|
|
|
elif token.type == 'code_inline':
|
|
|
|
entities.append(_tl.MessageEntityCode(offset=len(message), length=len(token.content)))
|
|
|
|
message += token.content
|
|
|
|
elif token.type in ('em_close', 'em_open'):
|
|
|
|
push(_tl.MessageEntityItalic)
|
|
|
|
elif token.type == 'fence':
|
|
|
|
entities.append(_tl.MessageEntityPre(offset=len(message), length=len(token.content), language=token.info))
|
|
|
|
message += token.content[:-1] # remove a single trailing newline
|
|
|
|
elif token.type == 'hardbreak':
|
|
|
|
message += '\n'
|
|
|
|
elif token.type in ('heading_close', 'heading_open'):
|
|
|
|
push(_tl.MessageEntityUnderline)
|
|
|
|
elif token.type == 'hr':
|
|
|
|
message += '\u2015\n\n'
|
|
|
|
elif token.type in ('link_close', 'link_open'):
|
|
|
|
if token.markup != 'autolink': # telegram already picks up on these automatically
|
|
|
|
push(_tl.MessageEntityTextUrl, url=token.attrs.get('href'))
|
|
|
|
elif token.type in ('s_close', 's_open'):
|
|
|
|
push(_tl.MessageEntityStrike)
|
|
|
|
elif token.type == 'softbreak':
|
|
|
|
message += ' '
|
|
|
|
elif token.type in ('strong_close', 'strong_open'):
|
|
|
|
push(_tl.MessageEntityBold)
|
|
|
|
elif token.type == 'text':
|
|
|
|
message += token.content
|
|
|
|
|
|
|
|
return del_surrogate(message), entities
|
|
|
|
|
|
|
|
|
|
|
|
def unparse(text, entities):
|
2017-11-26 19:16:59 +03:00
|
|
|
"""
|
|
|
|
Performs the reverse operation to .parse(), effectively returning
|
2021-09-12 13:16:02 +03:00
|
|
|
markdown-like syntax given a normal text and its _tl.MessageEntity's.
|
2017-11-26 19:16:59 +03:00
|
|
|
|
2021-09-25 21:33:25 +03:00
|
|
|
Because there are many possible ways for markdown to produce a certain
|
|
|
|
output, this function cannot invert .parse() perfectly.
|
2017-11-26 19:16:59 +03:00
|
|
|
"""
|
2018-06-03 14:48:43 +03:00
|
|
|
if not text or not entities:
|
2018-02-15 13:52:46 +03:00
|
|
|
return text
|
|
|
|
|
2021-09-12 17:58:06 +03:00
|
|
|
if isinstance(entities, tlobject.TLObject):
|
2017-11-26 19:16:59 +03:00
|
|
|
entities = (entities,)
|
|
|
|
|
2018-06-29 12:04:42 +03:00
|
|
|
text = add_surrogate(text)
|
2019-06-24 14:48:29 +03:00
|
|
|
insert_at = []
|
2017-11-26 19:16:59 +03:00
|
|
|
for entity in entities:
|
2018-01-07 18:18:54 +03:00
|
|
|
s = entity.offset
|
|
|
|
e = entity.offset + entity.length
|
2021-09-25 21:33:25 +03:00
|
|
|
delimiter = DELIMITERS.get(type(entity), None)
|
2017-11-26 19:16:59 +03:00
|
|
|
if delimiter:
|
2021-09-25 21:33:25 +03:00
|
|
|
insert_at.append((s, delimiter[0]))
|
|
|
|
insert_at.append((e, delimiter[1]))
|
|
|
|
elif isinstance(entity, _tl.MessageEntityPre):
|
|
|
|
insert_at.append((s, f'```{entity.language}\n'))
|
|
|
|
insert_at.append((e, '```\n'))
|
|
|
|
elif isinstance(entity, _tl.MessageEntityTextUrl):
|
|
|
|
insert_at.append((s, '['))
|
|
|
|
insert_at.append((e, f']({entity.url})'))
|
|
|
|
elif isinstance(entity, _tl.MessageEntityMentionName):
|
|
|
|
insert_at.append((s, '['))
|
|
|
|
insert_at.append((e, f'](tg://user?id={entity.user_id})'))
|
2019-06-24 14:48:29 +03:00
|
|
|
|
|
|
|
insert_at.sort(key=lambda t: t[0])
|
|
|
|
while insert_at:
|
|
|
|
at, what = insert_at.pop()
|
2019-12-19 17:48:27 +03:00
|
|
|
|
2020-02-20 12:53:28 +03:00
|
|
|
# If we are in the middle of a surrogate nudge the position by -1.
|
2019-12-19 17:48:27 +03:00
|
|
|
# Otherwise we would end up with malformed text and fail to encode.
|
|
|
|
# For example of bad input: "Hi \ud83d\ude1c"
|
|
|
|
# https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
|
2020-02-20 12:53:28 +03:00
|
|
|
while within_surrogate(text, at):
|
2019-12-19 17:48:27 +03:00
|
|
|
at += 1
|
|
|
|
|
2019-06-24 14:48:29 +03:00
|
|
|
text = text[:at] + what + text[at:]
|
2017-11-26 19:16:59 +03:00
|
|
|
|
2018-06-29 12:04:42 +03:00
|
|
|
return del_surrogate(text)
|