Work on byte level when parsing markdown

Reasoning: instead of encoding every character one by one as we
encounter it, and then using half the encoded length as the correct
offset, we can simply encode the whole string at once as utf-16le
and work with that directly.
This commit is contained in:
Lonami Exo 2017-11-06 10:29:32 +01:00
parent e8248b4b8b
commit 4f80429215

View File

@ -11,8 +11,6 @@ from ..tl.types import (
MessageEntityPre, MessageEntityTextUrl MessageEntityPre, MessageEntityTextUrl
) )
def tg_string_len(s):
return len(s.encode('utf-16le')) // 2
class Mode(Enum): class Mode(Enum):
"""Different modes supported by Telegram's Markdown""" """Different modes supported by Telegram's Markdown"""
@ -31,7 +29,10 @@ DEFAULT_DELIMITERS = {
'```': Mode.PRE '```': Mode.PRE
} }
DEFAULT_URL_RE = re.compile(r'\[(.+?)\]\((.+?)\)') # Regex used to match utf-16le encoded r'\[(.+?)\]\((.+?)\)',
# reason why there's '\0' after every match-literal character.
DEFAULT_URL_RE = re.compile(b'\\[\0(.+)\\]\0\\(\0(.+?)\\)\0')
def parse(message, delimiters=None, url_re=None): def parse(message, delimiters=None, url_re=None):
""" """
@ -40,40 +41,45 @@ def parse(message, delimiters=None, url_re=None):
dictionary (or default if None). dictionary (or default if None).
The url_re(gex) must contain two matching groups: the text to be The url_re(gex) must contain two matching groups: the text to be
clickable and the URL itself. clickable and the URL itself, and be utf-16le encoded.
""" """
# Work on byte level with the utf-16le encoding to get the offsets right.
# The offset will just be half the index we're at.
if url_re is None: if url_re is None:
url_re = DEFAULT_URL_RE url_re = DEFAULT_URL_RE
elif url_re: elif url_re:
if isinstance(url_re, str): if isinstance(url_re, str):
url_re = re.compile(url_re) url_re = re.compile(url_re.encode('utf-16le'))
if not delimiters: if not delimiters:
if delimiters is not None: if delimiters is not None:
return message, [] return message, []
delimiters = DEFAULT_DELIMITERS delimiters = DEFAULT_DELIMITERS
delimiters = {k.encode('utf-16le'): v for k, v in delimiters.items()}
i = 0
result = [] result = []
current = Mode.NONE current = Mode.NONE
offset = 0 message = message.encode('utf-16le')
i = 0
while i < len(message): while i < len(message):
url_match = None url_match = None
if url_re and current == Mode.NONE: if url_re and current == Mode.NONE:
url_match = url_re.match(message, pos=i) url_match = url_re.match(message, pos=i)
if url_match: if url_match:
message = ''.join(( message = b''.join((
message[:url_match.start()], message[:url_match.start()],
url_match.group(1), url_match.group(1),
message[url_match.end():] message[url_match.end():]
)) ))
result.append(( result.append((
offset, i // 2,
offset + tg_string_len(url_match.group(1)), (i + len(url_match.group(1))) // 2,
(Mode.URL, url_match.group(2)) (Mode.URL, url_match.group(2).decode('utf-16le'))
)) ))
i += len(url_match.group(1)) i += len(url_match.group(1))
if not url_match: if not url_match:
for d, m in delimiters.items(): for d, m in delimiters.items():
if message[i:i + len(d)] == d and current in (Mode.NONE, m): if message[i:i + len(d)] == d and current in (Mode.NONE, m):
@ -82,21 +88,20 @@ def parse(message, delimiters=None, url_re=None):
message = message[:i] + message[i + len(d):] message = message[:i] + message[i + len(d):]
if current == Mode.NONE: if current == Mode.NONE:
result.append(offset) result.append(i // 2)
current = m current = m
else: else:
result[-1] = (result[-1], offset, current) result[-1] = (result[-1], i // 2, current)
current = Mode.NONE current = Mode.NONE
break break
if i < len(message): if i < len(message):
offset += tg_string_len(message[i]) i += 2
i += 1
if result and not isinstance(result[-1], tuple): if result and not isinstance(result[-1], tuple):
result.pop() result.pop()
return message, result return message.decode('utf-16le'), result
def parse_tg(message, delimiters=None): def parse_tg(message, delimiters=None):