diff --git a/telethon/extensions/html.py b/telethon/extensions/html.py index 0e2e8f33..9ca8e5f5 100644 --- a/telethon/extensions/html.py +++ b/telethon/extensions/html.py @@ -1,6 +1,7 @@ """ Simple HTML -> Telegram entity parser. """ + from collections import deque from html import escape from html.parser import HTMLParser @@ -9,18 +10,25 @@ from typing import Iterable, Tuple, List from ..helpers import add_surrogate, del_surrogate, within_surrogate, strip_text from ..tl import TLObject from ..tl.types import ( - MessageEntityBold, MessageEntityItalic, MessageEntityCode, - MessageEntityPre, MessageEntityEmail, MessageEntityUrl, - MessageEntityTextUrl, MessageEntityMentionName, - MessageEntityUnderline, MessageEntityStrike, MessageEntityBlockquote, - TypeMessageEntity + MessageEntityBold, + MessageEntityItalic, + MessageEntityCode, + MessageEntityPre, + MessageEntityEmail, + MessageEntityUrl, + MessageEntityTextUrl, + MessageEntityMentionName, + MessageEntityUnderline, + MessageEntityStrike, + MessageEntityBlockquote, + TypeMessageEntity, ) class HTMLToTelegramParser(HTMLParser): def __init__(self): super().__init__() - self.text = '' + self.text = "" self.entities = [] self._building_entities = {} self._open_tags = deque() @@ -33,17 +41,19 @@ class HTMLToTelegramParser(HTMLParser): attrs = dict(attrs) EntityType = None args = {} - if tag == 'strong' or tag == 'b': + if tag == "strong" or tag == "b": EntityType = MessageEntityBold - elif tag == 'em' or tag == 'i': + elif tag == "em" or tag == "i": EntityType = MessageEntityItalic - elif tag == 'u': + elif tag == "u": EntityType = MessageEntityUnderline - elif tag == 'del' or tag == 's': + elif tag == "del" or tag == "s": EntityType = MessageEntityStrike - elif tag == 'blockquote': + elif tag == "blockquote": EntityType = MessageEntityBlockquote - elif tag == 'code': + if "expandable" in attrs.keys(): + args["collapsed"] = True + elif tag == "code": try: # If we're in the middle of a
tag, this tag is
# probably intended for syntax highlighting.
@@ -51,30 +61,30 @@ class HTMLToTelegramParser(HTMLParser):
# Syntax highlighting is set with
# codeblock
# inside tags
- pre = self._building_entities['pre']
+ pre = self._building_entities["pre"]
try:
- pre.language = attrs['class'][len('language-'):]
+ pre.language = attrs["class"][len("language-") :]
except KeyError:
pass
except KeyError:
EntityType = MessageEntityCode
- elif tag == 'pre':
+ elif tag == "pre":
EntityType = MessageEntityPre
- args['language'] = ''
- elif tag == 'a':
+ args["language"] = ""
+ elif tag == "a":
try:
- url = attrs['href']
+ url = attrs["href"]
except KeyError:
return
- if url.startswith('mailto:'):
- url = url[len('mailto:'):]
+ if url.startswith("mailto:"):
+ url = url[len("mailto:") :]
EntityType = MessageEntityEmail
else:
if self.get_starttag_text() == url:
EntityType = MessageEntityUrl
else:
EntityType = MessageEntityTextUrl
- args['url'] = del_surrogate(url)
+ args["url"] = del_surrogate(url)
url = None
self._open_tags_meta.popleft()
self._open_tags_meta.appendleft(url)
@@ -84,11 +94,12 @@ class HTMLToTelegramParser(HTMLParser):
offset=len(self.text),
# The length will be determined when closing the tag.
length=0,
- **args)
+ **args,
+ )
def handle_data(self, text):
- previous_tag = self._open_tags[0] if len(self._open_tags) > 0 else ''
- if previous_tag == 'a':
+ previous_tag = self._open_tags[0] if len(self._open_tags) > 0 else ""
+ if previous_tag == "a":
url = self._open_tags_meta[0]
if url:
text = url
@@ -129,23 +140,23 @@ def parse(html: str) -> Tuple[str, List[TypeMessageEntity]]:
ENTITY_TO_FORMATTER = {
- MessageEntityBold: ('', ''),
- MessageEntityItalic: ('', ''),
- MessageEntityCode: ('', '
'),
- MessageEntityUnderline: ('', ''),
- MessageEntityStrike: ('', ''),
- MessageEntityBlockquote: ('', '
'),
+ MessageEntityBold: ("", ""),
+ MessageEntityItalic: ("", ""),
+ MessageEntityCode: ("", "
"),
+ MessageEntityUnderline: ("", ""),
+ MessageEntityStrike: ("", ""),
+ MessageEntityBlockquote: ("", "
"),
MessageEntityPre: lambda e, _: (
- "\n"
- " \n"
- " ".format(e.language), "{}\n"
- "
\n"
- "
"
+ "\n" " \n" " ".format(e.language),
+ "{}\n" "
\n" "
",
+ ),
+ MessageEntityEmail: lambda _, t: (''.format(t), ""),
+ MessageEntityUrl: lambda _, t: (''.format(t), ""),
+ MessageEntityTextUrl: lambda e, _: (''.format(escape(e.url)), ""),
+ MessageEntityMentionName: lambda e, _: (
+ ''.format(e.user_id),
+ "",
),
- MessageEntityEmail: lambda _, t: (''.format(t), ''),
- MessageEntityUrl: lambda _, t: (''.format(t), ''),
- MessageEntityTextUrl: lambda e, _: (''.format(escape(e.url)), ''),
- MessageEntityMentionName: lambda e, _: (''.format(e.user_id), ''),
}
@@ -186,7 +197,12 @@ def unparse(text: str, entities: Iterable[TypeMessageEntity]) -> str:
while within_surrogate(text, at):
at += 1
- text = text[:at] + what + escape(text[at:next_escape_bound]) + text[next_escape_bound:]
+ text = (
+ text[:at]
+ + what
+ + escape(text[at:next_escape_bound])
+ + text[next_escape_bound:]
+ )
next_escape_bound = at
text = escape(text[:next_escape_bound]) + text[next_escape_bound:]