From 59a1a6aef22c67947489266bdf56609caf8196a7 Mon Sep 17 00:00:00 2001 From: Lonami Exo Date: Sun, 7 Jan 2018 16:18:54 +0100 Subject: [PATCH] Stop working with bytes on the markdown parser --- telethon/extensions/markdown.py | 77 ++++++++++++++++----------------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/telethon/extensions/markdown.py b/telethon/extensions/markdown.py index 6285bf28..10327c46 100644 --- a/telethon/extensions/markdown.py +++ b/telethon/extensions/markdown.py @@ -4,6 +4,7 @@ for use within the library, which attempts to handle emojies correctly, since they seem to count as two characters and it's a bit strange. """ import re +import struct from ..tl import TLObject @@ -20,15 +21,24 @@ DEFAULT_DELIMITERS = { '```': MessageEntityPre } -# Regex used to match utf-16le encoded r'\[(.+?)\]\((.+?)\)', -# reason why there's '\0' after every match-literal character. -DEFAULT_URL_RE = re.compile(b'\\[\0(.+?)\\]\0\\(\0(.+?)\\)\0') +# Regex used to match r'\[(.+?)\]\((.+?)\)' (for URLs). +DEFAULT_URL_RE = re.compile(r'\[(.+?)\]\((.+?)\)') # Reverse operation for DEFAULT_URL_RE. {0} for text, {1} for URL. DEFAULT_URL_FORMAT = '[{0}]({1})' -# Encoding to be used -ENC = 'utf-16le' + +def _add_surrogate(text): + return ''.join( + # SMP -> Surrogate Pairs (Telegram offsets are calculated with these). + # See https://en.wikipedia.org/wiki/Plane_(Unicode)#Overview for more. + ''.join(chr(y) for y in struct.unpack('