mirror of
https://github.com/LonamiWebs/Telethon.git
synced 2025-02-24 15:30:48 +03:00
Bring client parsers for commonmark and HTML
This commit is contained in:
parent
69d7941852
commit
332215ea2e
|
@ -21,6 +21,7 @@ classifiers = [
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"pyaes~=1.6",
|
"pyaes~=1.6",
|
||||||
"rsa~=4.9",
|
"rsa~=4.9",
|
||||||
|
"markdown-it-py~=3.0",
|
||||||
]
|
]
|
||||||
dynamic = ["version"]
|
dynamic = ["version"]
|
||||||
|
|
||||||
|
|
0
client/src/telethon/_impl/client/__init__.py
Normal file
0
client/src/telethon/_impl/client/__init__.py
Normal file
11
client/src/telethon/_impl/client/parsers/__init__.py
Normal file
11
client/src/telethon/_impl/client/parsers/__init__.py
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
"""Public re-exports of the HTML and markdown message parsers."""
from .html import parse as parse_html_message
from .html import unparse as generate_html_message
from .markdown import parse as parse_markdown_message
from .markdown import unparse as generate_markdown_message

__all__ = [
    "generate_html_message",
    "parse_html_message",
    "generate_markdown_message",
    "parse_markdown_message",
]
|
207
client/src/telethon/_impl/client/parsers/html.py
Normal file
207
client/src/telethon/_impl/client/parsers/html.py
Normal file
|
@ -0,0 +1,207 @@
|
||||||
|
from collections import deque
|
||||||
|
from html import escape
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
from typing import Any, Deque, Dict, Iterable, List, Optional, Tuple, Type, cast
|
||||||
|
|
||||||
|
from ...tl.abcs import MessageEntity
|
||||||
|
from ...tl.types import (
|
||||||
|
MessageEntityBlockquote,
|
||||||
|
MessageEntityBold,
|
||||||
|
MessageEntityCode,
|
||||||
|
MessageEntityEmail,
|
||||||
|
MessageEntityItalic,
|
||||||
|
MessageEntityMentionName,
|
||||||
|
MessageEntityPre,
|
||||||
|
MessageEntitySpoiler,
|
||||||
|
MessageEntityStrike,
|
||||||
|
MessageEntityTextUrl,
|
||||||
|
MessageEntityUnderline,
|
||||||
|
MessageEntityUrl,
|
||||||
|
)
|
||||||
|
from .strings import add_surrogate, del_surrogate, strip_text, within_surrogate
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLToTelegramParser(HTMLParser):
    """
    Incremental HTML parser that accumulates the plain text in ``self.text``
    and the Telegram ``MessageEntity`` list in ``self.entities``.

    The caller is expected to feed surrogate-expanded text (see
    ``add_surrogate``) so entity offsets/lengths are in UTF-16 code units.
    """

    def __init__(self) -> None:
        super().__init__()
        self.text = ""
        self.entities: List[MessageEntity] = []
        # Entities whose closing tag has not been seen yet, keyed by tag name.
        self._building_entities: Dict[str, MessageEntity] = {}
        # Stack of currently-open tag names (index 0 is the innermost).
        self._open_tags: Deque[str] = deque()
        # Parallel stack carrying the <a href="..."> URL for anchors, or None.
        self._open_tags_meta: Deque[Optional[str]] = deque()

    def handle_starttag(
        self, tag: str, attrs_seq: List[Tuple[str, Optional[str]]]
    ) -> None:
        self._open_tags.appendleft(tag)
        self._open_tags_meta.appendleft(None)

        attrs = dict(attrs_seq)
        EntityType: Optional[Type[MessageEntity]] = None
        args: Dict[str, Any] = {}
        if tag == "strong" or tag == "b":
            EntityType = MessageEntityBold
        elif tag == "em" or tag == "i":
            EntityType = MessageEntityItalic
        elif tag == "u":
            EntityType = MessageEntityUnderline
        elif tag == "del" or tag == "s":
            EntityType = MessageEntityStrike
        elif tag == "blockquote":
            EntityType = MessageEntityBlockquote
        elif tag == "details":
            EntityType = MessageEntitySpoiler
        elif tag == "code":
            try:
                # If we're in the middle of a <pre> tag, this <code> tag is
                # probably intended for syntax highlighting.
                #
                # Syntax highlighting is set with
                # <code class='language-...'>codeblock</code>
                # inside <pre> tags
                pre = self._building_entities["pre"]
                assert isinstance(pre, MessageEntityPre)
                cls = attrs.get("class")
                # Fix: only honour the conventional "language-xyz" class.
                # Previously any class value was blindly sliced by nine
                # characters, turning e.g. class="highlight" into the bogus
                # language "ght".
                if cls and cls.startswith("language-"):
                    pre.language = cls[len("language-") :]
            except KeyError:
                EntityType = MessageEntityCode
        elif tag == "pre":
            EntityType = MessageEntityPre
            args["language"] = ""
        elif tag == "a":
            url = attrs.get("href")
            if not url:
                # An anchor without a destination produces no entity.
                return
            if url.startswith("mailto:"):
                url = url[len("mailto:") :]
                EntityType = MessageEntityEmail
            else:
                if self.get_starttag_text() == url:
                    EntityType = MessageEntityUrl
                else:
                    EntityType = MessageEntityTextUrl
                    args["url"] = del_surrogate(url)
                    url = None
            # Remember the URL so handle_data can substitute it for the
            # anchor's inner text (only kept when inner text should equal it).
            self._open_tags_meta.popleft()
            self._open_tags_meta.appendleft(url)

        if EntityType and tag not in self._building_entities:
            Et = cast(Any, EntityType)
            self._building_entities[tag] = Et(
                offset=len(self.text),
                # The length will be determined when closing the tag.
                length=0,
                **args,
            )

    def handle_data(self, text: str) -> None:
        previous_tag = self._open_tags[0] if len(self._open_tags) > 0 else ""
        if previous_tag == "a":
            url = self._open_tags_meta[0]
            if url:
                text = url

        # Every entity still being built spans this data; grow them all.
        for entity in self._building_entities.values():
            assert hasattr(entity, "length")
            entity.length += len(text)

        self.text += text

    def handle_endtag(self, tag: str) -> None:
        try:
            self._open_tags.popleft()
            self._open_tags_meta.popleft()
        except IndexError:
            # Unbalanced closing tag; ignore it.
            pass
        entity = self._building_entities.pop(tag, None)
        # Zero-length entities (e.g. <pre></pre>) are dropped.
        if entity and hasattr(entity, "length") and entity.length:
            self.entities.append(entity)
|
||||||
|
|
||||||
|
|
||||||
|
def parse(html: str) -> Tuple[str, List[MessageEntity]]:
    """
    Parses the given HTML message and returns its stripped representation
    plus a list of the MessageEntity's that were found.

    :param html: the message with HTML to be parsed.
    :return: a tuple consisting of (clean message, [message entities]).
    """
    if not html:
        return html, []

    tg_parser = HTMLToTelegramParser()
    # Work in UTF-16 space so entity offsets match what Telegram expects.
    tg_parser.feed(add_surrogate(html))
    stripped = strip_text(tg_parser.text, tg_parser.entities)
    return del_surrogate(stripped), tg_parser.entities
|
||||||
|
|
||||||
|
|
||||||
|
# Maps each entity type to its HTML delimiters: either a fixed
# (open-tag, close-tag) pair, or a callable (entity, inner_text) -> (open,
# close) for tags whose markup depends on the entity or the wrapped text.
ENTITY_TO_FORMATTER = {
    MessageEntityBold: ("<strong>", "</strong>"),
    MessageEntityItalic: ("<em>", "</em>"),
    MessageEntityCode: ("<code>", "</code>"),
    MessageEntityUnderline: ("<u>", "</u>"),
    MessageEntityStrike: ("<del>", "</del>"),
    MessageEntityBlockquote: ("<blockquote>", "</blockquote>"),
    MessageEntitySpoiler: ("<details>", "</details>"),
    # <pre> carries a nested <code class="language-..."> only when a
    # language is set, mirroring what the parser accepts.
    MessageEntityPre: lambda e, _: (
        '<pre><code class="language-{}">'.format(e.language) if e.language else "<pre>",
        "</code></pre>" if e.language else "</pre>",
    ),
    MessageEntityEmail: lambda _, t: ('<a href="mailto:{}">'.format(t), "</a>"),
    MessageEntityUrl: lambda _, t: ('<a href="{}">'.format(t), "</a>"),
    MessageEntityTextUrl: lambda e, _: ('<a href="{}">'.format(escape(e.url)), "</a>"),
    MessageEntityMentionName: lambda e, _: (
        '<a href="tg://user?id={}">'.format(e.user_id),
        "</a>",
    ),
}
|
||||||
|
|
||||||
|
|
||||||
|
def unparse(text: str, entities: Iterable[MessageEntity]) -> str:
    """
    Performs the reverse operation to .parse(), effectively returning HTML
    given a normal text and its MessageEntity's.

    :param text: the text to be reconverted into HTML.
    :param entities: the MessageEntity's applied to the text.
    :return: a HTML representation of the combination of both inputs.
    """
    if not text:
        return text
    elif not entities:
        return escape(text)

    # Work in UTF-16 space so entity offsets line up with the text.
    text = add_surrogate(text)
    # Collect (position, markup) pairs to splice into the text.
    insert_at: List[Tuple[int, str]] = []
    for entity in entities:
        assert hasattr(entity, "offset") and hasattr(entity, "length")
        s = entity.offset
        e = entity.offset + entity.length
        delimiter = ENTITY_TO_FORMATTER.get(type(entity), None)
        if delimiter:
            if callable(delimiter):
                # Dynamic delimiters receive the entity and the spanned text.
                delim = delimiter(entity, text[s:e])
            else:
                delim = delimiter
            insert_at.append((s, delim[0]))
            insert_at.append((e, delim[1]))

    # Process insertions right-to-left so earlier offsets stay valid, and
    # escape each original segment exactly once: the region between the
    # current insertion point and the previous (higher) bound.
    insert_at.sort(key=lambda t: t[0])
    next_escape_bound = len(text)
    while insert_at:
        # Same logic as markdown.py
        at, what = insert_at.pop()
        while within_surrogate(text, at):
            at += 1

        text = (
            text[:at]
            + what
            + escape(text[at:next_escape_bound])
            + text[next_escape_bound:]
        )
        next_escape_bound = at

    # Escape whatever precedes the first (lowest) insertion point.
    text = escape(text[:next_escape_bound]) + text[next_escape_bound:]

    return del_surrogate(text)
|
191
client/src/telethon/_impl/client/parsers/markdown.py
Normal file
191
client/src/telethon/_impl/client/parsers/markdown.py
Normal file
|
@ -0,0 +1,191 @@
|
||||||
|
import re
|
||||||
|
from typing import Any, Iterator, List, Tuple
|
||||||
|
|
||||||
|
import markdown_it
|
||||||
|
import markdown_it.token
|
||||||
|
|
||||||
|
from ...tl.abcs import MessageEntity
|
||||||
|
from ...tl.types import (
|
||||||
|
MessageEntityBlockquote,
|
||||||
|
MessageEntityBold,
|
||||||
|
MessageEntityCode,
|
||||||
|
MessageEntityItalic,
|
||||||
|
MessageEntityMentionName,
|
||||||
|
MessageEntityPre,
|
||||||
|
MessageEntityStrike,
|
||||||
|
MessageEntityTextUrl,
|
||||||
|
MessageEntityUnderline,
|
||||||
|
)
|
||||||
|
from .strings import add_surrogate, del_surrogate, within_surrogate
|
||||||
|
|
||||||
|
# CommonMark parser with the ~~strikethrough~~ extension enabled.
MARKDOWN = markdown_it.MarkdownIt().enable("strikethrough")
# (prefix, suffix) pairs used by `unparse` for entities with a fixed
# markdown representation; entities needing dynamic markup (pre, links,
# mentions) are special-cased in `unparse` instead.
DELIMITERS = {
    MessageEntityBlockquote: ("> ", ""),
    MessageEntityBold: ("**", "**"),
    MessageEntityCode: ("`", "`"),
    MessageEntityItalic: ("_", "_"),
    MessageEntityStrike: ("~~", "~~"),
    MessageEntityUnderline: ("# ", ""),
}

# Not trying to be complete; just enough to have an alternative (mostly for inline underline).
# The fact headings are treated as underline is an implementation detail.
# TAG_PATTERN captures ("/" or "", tag-name) from a raw inline HTML tag.
TAG_PATTERN = re.compile(r"<\s*(/?)\s*(\w+)")
# Maps an HTML tag to the pair of markdown-it token types it stands for.
HTML_TO_TYPE = {
    "i": ("em_close", "em_open"),
    "em": ("em_close", "em_open"),
    "b": ("strong_close", "strong_open"),
    "strong": ("strong_close", "strong_open"),
    "s": ("s_close", "s_open"),
    "del": ("s_close", "s_open"),
    "u": ("heading_open", "heading_close"),
    "mark": ("heading_open", "heading_close"),
}
|
||||||
|
|
||||||
|
|
||||||
|
def expand_inline_and_html(
    tokens: List[markdown_it.token.Token],
) -> Iterator[markdown_it.token.Token]:
    """
    Flatten a markdown-it token stream: recurse into "inline" containers,
    and rewrite recognised raw "html_inline" tags into the corresponding
    markdown-it open/close token types so `parse` can treat them uniformly.
    """
    for token in tokens:
        if token.type == "inline":
            if token.children:
                yield from expand_inline_and_html(token.children)
        elif token.type == "html_inline":
            match = TAG_PATTERN.match(token.content)
            if match:
                close, tag = match.groups()
                tys = HTML_TO_TYPE.get(tag.lower())
                if tys:
                    # Either member of the pair identifies the same entity:
                    # `parse` matches on membership in the pair and relies on
                    # `nesting` (not the exact type name) to tell open from
                    # close, so the index here only needs to pick a valid one.
                    token.type = tys[bool(close)]
                    token.nesting = -1 if close else 1
                    yield token
            # Unrecognised inline HTML is silently dropped.
        else:
            yield token
|
||||||
|
|
||||||
|
|
||||||
|
def parse(message: str) -> Tuple[str, List[MessageEntity]]:
    """
    Parses the given markdown message and returns its stripped representation
    plus a list of the MessageEntity's that were found.

    :param message: the markdown message to parse.
    :return: a tuple consisting of (clean message, [message entities]).
    """
    if not message:
        return message, []

    entities: List[MessageEntity]
    token: markdown_it.token.Token

    def push(ty: Any, **extra: object) -> None:
        # On an opening token, start a zero-length entity at the current
        # position; on a closing token, finish the innermost entity of the
        # same type by computing its length.
        nonlocal message, entities, token
        if token.nesting > 0:
            entities.append(ty(offset=len(message), length=0, **extra))
        else:
            for entity in reversed(entities):
                if isinstance(entity, ty):
                    entity.length = len(message) - entity.offset
                    break

    # Parse in UTF-16 space so entity offsets match what Telegram expects.
    parsed = MARKDOWN.parse(add_surrogate(message.strip()))
    message = ""
    entities = []
    last_map = [0, 0]
    for token in expand_inline_and_html(parsed):
        if token.map is not None and token.map != last_map:
            # paragraphs, quotes and fences have a line mapping. Use it to determine how many newlines to insert.
            # But don't insert any (leading) new lines if we're yet to reach the first textual content, or
            # if the mappings are the same (e.g. a quote then opens a paragraph but the mapping is equal).
            if message:
                message += "\n" + "\n" * (token.map[0] - last_map[-1])
            last_map = token.map

        if token.type in ("blockquote_close", "blockquote_open"):
            push(MessageEntityBlockquote)
        elif token.type == "code_block":
            entities.append(
                MessageEntityPre(
                    offset=len(message), length=len(token.content), language=""
                )
            )
            message += token.content
        elif token.type == "code_inline":
            entities.append(
                MessageEntityCode(offset=len(message), length=len(token.content))
            )
            message += token.content
        elif token.type in ("em_close", "em_open"):
            push(MessageEntityItalic)
        elif token.type == "fence":
            entities.append(
                MessageEntityPre(
                    offset=len(message), length=len(token.content), language=token.info
                )
            )
            message += token.content[:-1]  # remove a single trailing newline
        elif token.type == "hardbreak":
            message += "\n"
        elif token.type in ("heading_close", "heading_open"):
            push(MessageEntityUnderline)
        elif token.type == "hr":
            # Horizontal rule rendered as a horizontal bar character.
            message += "\u2015\n\n"
        elif token.type in ("link_close", "link_open"):
            if (
                token.markup != "autolink"
            ):  # telegram already picks up on these automatically
                push(MessageEntityTextUrl, url=token.attrs.get("href"))
        elif token.type in ("s_close", "s_open"):
            push(MessageEntityStrike)
        elif token.type == "softbreak":
            message += " "
        elif token.type in ("strong_close", "strong_open"):
            push(MessageEntityBold)
        elif token.type == "text":
            message += token.content

    return del_surrogate(message), entities
|
||||||
|
|
||||||
|
|
||||||
|
def unparse(text: str, entities: List[MessageEntity]) -> str:
    """
    Performs the reverse operation to .parse(), effectively returning
    markdown-like syntax given a normal text and its MessageEntity's.

    Because there are many possible ways for markdown to produce a certain
    output, this function cannot invert .parse() perfectly.

    :param text: the text to be reconverted into markdown.
    :param entities: the MessageEntity's applied to the text.
    :return: a markdown representation of the combination of both inputs.
    """
    if not text or not entities:
        return text

    # Work in UTF-16 space so entity offsets line up with the text.
    text = add_surrogate(text)
    # Collect (position, markup) pairs to splice into the text.
    insert_at: List[Tuple[int, str]] = []
    for entity in entities:
        assert hasattr(entity, "offset")
        assert hasattr(entity, "length")
        s = entity.offset
        e = entity.offset + entity.length
        delimiter = DELIMITERS.get(type(entity), None)
        if delimiter:
            insert_at.append((s, delimiter[0]))
            insert_at.append((e, delimiter[1]))
        elif isinstance(entity, MessageEntityPre):
            insert_at.append((s, f"```{entity.language}\n"))
            insert_at.append((e, "```\n"))
        elif isinstance(entity, MessageEntityTextUrl):
            insert_at.append((s, "["))
            insert_at.append((e, f"]({entity.url})"))
        elif isinstance(entity, MessageEntityMentionName):
            insert_at.append((s, "["))
            insert_at.append((e, f"](tg://user?id={entity.user_id})"))

    # Splice right-to-left so earlier positions remain valid.
    insert_at.sort(key=lambda t: t[0])
    while insert_at:
        at, what = insert_at.pop()

        # If we are in the middle of a surrogate nudge the position by +1.
        # Otherwise we would end up with malformed text and fail to encode.
        # For example of bad input: "Hi \ud83d\ude1c"
        # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
        while within_surrogate(text, at):
            at += 1

        text = text[:at] + what + text[at:]

    return del_surrogate(text)
|
75
client/src/telethon/_impl/client/parsers/strings.py
Normal file
75
client/src/telethon/_impl/client/parsers/strings.py
Normal file
|
@ -0,0 +1,75 @@
|
||||||
|
import struct
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from ...tl.abcs import MessageEntity
|
||||||
|
|
||||||
|
|
||||||
|
def add_surrogate(text: str) -> str:
    """
    Expand every astral-plane character into its UTF-16 surrogate pair.

    Telegram computes entity offsets in UTF-16 code units, so working on
    the surrogate-expanded string makes `len` and slicing line up.
    See https://en.wikipedia.org/wiki/Plane_(Unicode)#Overview for more.
    """
    pieces = []
    for ch in text:
        if 0x10000 <= ord(ch) <= 0x10FFFF:
            # SMP -> Surrogate Pair: re-encode as two UTF-16 code units.
            high, low = struct.unpack("<HH", ch.encode("utf-16le"))
            pieces.append(chr(high))
            pieces.append(chr(low))
        else:
            pieces.append(ch)
    return "".join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
def del_surrogate(text: str) -> str:
    """Collapse UTF-16 surrogate pairs back into real characters (inverse of add_surrogate)."""
    # "surrogatepass" lets the lone surrogates survive the round-trip.
    encoded = text.encode("utf-16", "surrogatepass")
    return encoded.decode("utf-16")
|
||||||
|
|
||||||
|
|
||||||
|
def within_surrogate(text: str, index: int, *, length: Optional[int] = None) -> bool:
    """
    `True` if ``index`` is within a surrogate (before and after it, not at!).

    :param text: the string to inspect (may contain lone UTF-16 surrogates).
    :param index: the position to test.
    :param length: upper bound to use instead of ``len(text)``.
    """
    if length is None:
        length = len(text)

    # Fix: the lower bound used to be `1 < index` (missing index == 1, i.e.
    # the middle of a surrogate pair at the very start of the string), and
    # `length` was computed but never used. An index is "within" a surrogate
    # when it has a character on each side and both are surrogate code units.
    return (
        0 < index < length  # in bounds, with a previous character
        and "\ud800" <= text[index - 1] <= "\udfff"  # previous is
        and "\ud800" <= text[index] <= "\udfff"  # current is
    )
|
||||||
|
|
||||||
|
|
||||||
|
def strip_text(text: str, entities: List[MessageEntity]) -> str:
    """
    Strips whitespace from the given text modifying the provided entities.

    This assumes that there are no overlapping entities, that their length
    is greater or equal to one, and that their length is not out of bounds.

    :param text: the text to strip.
    :param entities: entities to adjust in place (shrunk or removed).
    :return: the stripped text.
    """
    if not entities:
        return text.strip()

    # Trailing whitespace: only the last entity can end at the text's end
    # (entities are assumed sorted and non-overlapping), so shrink or drop
    # it one character at a time as the text shrinks.
    while text and text[-1].isspace():
        e = entities[-1]
        assert hasattr(e, "offset") and hasattr(e, "length")
        if e.offset + e.length == len(text):
            if e.length == 1:
                del entities[-1]
                if not entities:
                    return text.strip()
            else:
                e.length -= 1
        text = text[:-1]

    # Leading whitespace: every entity shifts left by one per stripped
    # character; the (at most one) entity starting at offset 0 shrinks
    # instead. Iterating in reverse keeps the index loop safe when the
    # first entity is deleted.
    while text and text[0].isspace():
        for i in reversed(range(len(entities))):
            e = entities[i]
            assert hasattr(e, "offset") and hasattr(e, "length")
            if e.offset != 0:
                e.offset -= 1
                continue

            if e.length == 1:
                del entities[0]
                if not entities:
                    return text.lstrip()
            else:
                e.length -= 1

        text = text[1:]

    return text
|
151
client/tests/test_parsers.py
Normal file
151
client/tests/test_parsers.py
Normal file
|
@ -0,0 +1,151 @@
|
||||||
|
from telethon._impl.client.parsers import (
|
||||||
|
generate_html_message,
|
||||||
|
generate_markdown_message,
|
||||||
|
parse_html_message,
|
||||||
|
parse_markdown_message,
|
||||||
|
)
|
||||||
|
from telethon._impl.tl import types
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_leading_markdown() -> None:
    """Bold at the very start of the message yields an entity at offset 0."""
    markdown = "**Hello** world!"
    text, entities = parse_markdown_message(markdown)
    assert text == "Hello world!"
    assert entities == [types.MessageEntityBold(offset=0, length=5)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_trailing_markdown() -> None:
    """Bold at the end of the message spans exactly the trailing word."""
    markdown = "Hello **world!**"
    text, entities = parse_markdown_message(markdown)
    assert text == "Hello world!"
    assert entities == [types.MessageEntityBold(offset=6, length=6)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_emoji_markdown() -> None:
    """An astral-plane emoji counts as two UTF-16 units in entity lengths."""
    markdown = "A **little 🦀** here"
    text, entities = parse_markdown_message(markdown)
    assert text == "A little 🦀 here"
    assert entities == [types.MessageEntityBold(offset=2, length=9)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_all_entities_markdown() -> None:
    """Every supported markdown construct produces its matching entity."""
    markdown = "Some **bold** (__strong__), *italics* (_cursive_), inline `code`, a\n```rust\npre\n```\nblock, a [link](https://example.com), and [mentions](tg://user?id=12345678)"
    text, entities = parse_markdown_message(markdown)

    assert (
        text
        == "Some bold (strong), italics (cursive), inline code, a\npre\nblock, a link, and mentions"
    )
    assert entities == [
        types.MessageEntityBold(offset=5, length=4),
        types.MessageEntityBold(offset=11, length=6),
        types.MessageEntityItalic(offset=20, length=7),
        types.MessageEntityItalic(offset=29, length=7),
        types.MessageEntityCode(offset=46, length=4),
        types.MessageEntityPre(offset=54, length=4, language="rust"),
        types.MessageEntityTextUrl(offset=67, length=4, url="https://example.com"),
        types.MessageEntityTextUrl(offset=77, length=8, url="tg://user?id=12345678"),
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_nested_entities_markdown() -> None:
    """Properly-nested emphasis produces overlapping-by-containment entities."""
    # CommonMark won't allow improper nesting such as "Some **bold _both** italics_"
    markdown = "Some **bold _both_** _italics_"
    text, entities = parse_markdown_message(markdown)
    assert text == "Some bold both italics"
    assert entities == [
        types.MessageEntityBold(offset=5, length=9),
        types.MessageEntityItalic(offset=10, length=4),
        types.MessageEntityItalic(offset=15, length=7),
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_then_unparse_markdown() -> None:
    """unparse(parse(x)) round-trips this markdown input byte-for-byte."""
    markdown = "Some **bold 🤷🏽♀️**, _italics_, inline `🤷🏽♀️ code`, a\n\n```rust\npre\n```\nblock, a [link](https://example.com), and [mentions](tg://user?id=12345678)"
    text, entities = parse_markdown_message(markdown)
    generated = generate_markdown_message(text, entities)
    assert generated == markdown
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_leading_html() -> None:
    """Tag names are matched case-insensitively."""
    # Intentionally use different casing to make sure that is handled well
    html = "<B>Hello</b> world!"
    text, entities = parse_html_message(html)
    assert text == "Hello world!"
    assert entities == [types.MessageEntityBold(offset=0, length=5)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_trailing_html() -> None:
    """<strong> at the end of the message spans exactly the trailing word."""
    html = "Hello <strong>world!</strong>"
    text, entities = parse_html_message(html)
    assert text == "Hello world!"
    assert entities == [types.MessageEntityBold(offset=6, length=6)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_emoji_html() -> None:
    """An astral-plane emoji counts as two UTF-16 units in entity lengths."""
    html = "A <b>little 🦀</b> here"
    text, entities = parse_html_message(html)
    assert text == "A little 🦀 here"
    assert entities == [types.MessageEntityBold(offset=2, length=9)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_all_entities_html() -> None:
    """Every supported HTML tag produces its matching entity."""
    html = 'Some <b>bold</b> (<strong>strong</strong>), <i>italics</i> (<em>cursive</em>), inline <code>code</code>, a <pre>pre</pre> block, a <a href="https://example.com">link</a>, <details>spoilers</details> and <a href="tg://user?id=12345678">mentions</a>'
    text, entities = parse_html_message(html)
    assert (
        text
        == "Some bold (strong), italics (cursive), inline code, a pre block, a link, spoilers and mentions"
    )
    assert entities == [
        types.MessageEntityBold(offset=5, length=4),
        types.MessageEntityBold(offset=11, length=6),
        types.MessageEntityItalic(offset=20, length=7),
        types.MessageEntityItalic(offset=29, length=7),
        types.MessageEntityCode(offset=46, length=4),
        types.MessageEntityPre(offset=54, length=3, language=""),
        types.MessageEntityTextUrl(offset=67, length=4, url="https://example.com"),
        types.MessageEntitySpoiler(offset=73, length=8),
        types.MessageEntityTextUrl(offset=86, length=8, url="tg://user?id=12345678"),
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_pre_with_lang_html() -> None:
    """<code class="language-..."> inside <pre> sets the pre's language."""
    html = 'Some <pre>pre</pre>, <code>normal</code> and <pre><code class="language-rust">rusty</code></pre> code'
    text, entities = parse_html_message(html)
    assert text == "Some pre, normal and rusty code"
    assert entities == [
        types.MessageEntityPre(offset=5, length=3, language=""),
        types.MessageEntityCode(offset=10, length=6),
        types.MessageEntityPre(offset=21, length=5, language="rust"),
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_empty_pre_and_lang_html() -> None:
    """Empty <pre> is dropped; <code class="language-..."> without <pre> is plain code."""
    html = 'Some empty <pre></pre> and <code class="language-rust">code</code>'
    text, entities = parse_html_message(html)
    assert text == "Some empty and code"
    assert entities == [types.MessageEntityCode(offset=16, length=4)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_link_no_href_html() -> None:
    """An anchor without an href produces no entity, only its inner text."""
    html = "Some <a>empty link</a>, it does nothing"
    text, entities = parse_html_message(html)
    assert text == "Some empty link, it does nothing"
    assert entities == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_nested_entities_html() -> None:
    """Improperly-nested tags still produce entities spanning their data."""
    html = "Some <b>bold <i>both</b> italics</i>"
    text, entities = parse_html_message(html)
    assert text == "Some bold both italics"
    assert entities == [
        types.MessageEntityBold(offset=5, length=9),
        types.MessageEntityItalic(offset=10, length=12),
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_then_unparse_html() -> None:
    """unparse(parse(x)) round-trips this HTML input byte-for-byte."""
    html = 'Some <strong>bold</strong>, <em>italics</em> inline <code>code</code>, a <pre>pre</pre> block <pre><code class="language-rs">use rust;</code></pre>, a <a href="https://example.com">link</a>, <details>spoilers</details> and <a href="tg://user?id=12345678">mentions</a>'
    text, entities = parse_html_message(html)
    generated = generate_html_message(text, entities)
    assert generated == html
|
Loading…
Reference in New Issue
Block a user