Telethon/parser/markdown_parser.py

from tl.types import MessageEntityBold, MessageEntityItalic, MessageEntityCode, MessageEntityTextUrl


def parse_message_entities(msg):
    """Parse the markdown-like markup of a message into plain text plus entities.

    The recognised syntax is a small subset of markdown, not the complete
    specification:

    * ``[text](url)`` becomes a ``MessageEntityTextUrl``
    * ``*text*`` becomes a ``MessageEntityBold``
    * ``_text_`` becomes a ``MessageEntityItalic``
    * text between two backticks becomes a ``MessageEntityCode``

    An indicator that is never closed is left in the text untouched.

    :param msg: the message string, possibly containing the markup above.
    :return: a ``(text, entities)`` tuple, where ``text`` is the message with
             the markup characters of every recognised entity removed, and
             ``entities`` is the list of entity objects found, each created
             with an ``offset`` and a ``length`` (and ``url`` for text links).
    """
    # NOTE(review): offsets and lengths are counted in Python characters;
    # confirm whether the consumer of these entities expects another unit.

    # Store the entities here
    entities = []

    # Work on a mutable list of characters so markup can be deleted in place
    msg = list(msg)

    # First, handle all the text links in the message, so that afterwards only
    # the simple indicators (bold, italic and fixed-width) are left to process.
    url_indices = [None] * 4  # start/end text index, start/end url index
    valid_url_indices = []  # all the complete url_indices found
    for i, c in enumerate(msg):
        if c == '[':
            url_indices[0] = i

        # From now on, also ensure that the previous marker was found. The
        # check must be "is not None": index 0 is a perfectly valid position
        # (a message that starts with a link) but it is falsy.
        elif c == ']' and url_indices[0] is not None:
            url_indices[1] = i

        elif c == '(' and url_indices[1] is not None:
            # If the ']' is not exactly before this '(', then it is not a
            # valid text link, so clear the previous state
            if url_indices[1] != i - 1:
                url_indices[:2] = [None] * 2
            else:
                url_indices[2] = i

        elif c == ')' and url_indices[2] is not None:
            # We have succeeded to find a markdown-like text link!
            url_indices[3] = i
            valid_url_indices.append(url_indices[:])  # Append a copy
            url_indices = [None] * 4

    # Iterate in reverse order to clean the text from the urls, so that
    # deleting one link does not shift the indices of the links before it
    for i in reversed(range(len(valid_url_indices))):
        text_start, text_end, url_start, url_end = valid_url_indices[i]

        # Add 1 to the start not to include the '[' nor the '('; the end
        # index is already excluded by the slice
        link_text = msg[text_start + 1:text_end]
        link_url = ''.join(msg[url_start + 1:url_end])

        # Drop "(url)" first (it comes later in the message), then replace
        # "[text]" by the bare text
        del msg[url_start:url_end + 1]
        msg[text_start:text_end + 1] = link_text

        # Every earlier link will also lose its url and its four markup
        # characters, so the final offset of this link moves left by the
        # sum of those removed lengths
        displacement = sum(prev[3] - prev[2] - 1 + len('[]()')
                           for prev in valid_url_indices[:i])

        # The text length excludes both the '[' and the ']'
        entities.append(MessageEntityTextUrl(offset=text_start - displacement,
                                             length=text_end - text_start - 1,
                                             url=link_url))

    # After the message is clean from links, handle all the indicator flags.
    # Each value is the index of the still-open indicator, or None
    indicator_flags = {
        '*': None,
        '_': None,
        '`': None
    }

    # Iterate over the list to find the indicators of entities
    for i, c in enumerate(msg):
        # Only perform further checks if the current character is an indicator
        if c not in indicator_flags:
            continue

        # If it is the first time we find this indicator, remember its index
        if indicator_flags[c] is None:
            indicator_flags[c] = i
            continue

        # Otherwise we found it before, hence the entity is now complete.
        # The offset still points at the opening indicator; the length
        # excludes both indicators
        offset = indicator_flags[c]
        length = i - offset - 1

        # Add the corresponding entity
        if c == '*':
            entities.append(MessageEntityBold(offset=offset, length=length))

        elif c == '_':
            entities.append(MessageEntityItalic(offset=offset, length=length))

        elif c == '`':
            entities.append(MessageEntityCode(offset=offset, length=length))

        # Clear the flag to start over with this indicator
        indicator_flags[c] = None

    # Sort the entities by their offset first
    entities = sorted(entities, key=lambda e: e.offset)

    # Now that all the entities have been found and sorted, remove
    # their indicators from the message and update the offsets.
    # NOTE(review): a text link nested inside another entity does not seem to
    # be covered by the cases below - verify before relying on that combination.
    for entity in entities:
        # Text links had their markup removed already
        if isinstance(entity, MessageEntityTextUrl):
            continue

        # Clean the message from the current entity's indicators; the closing
        # one goes first so the index of the opening one is still valid
        del msg[entity.offset + entity.length + 1]
        del msg[entity.offset]

        # Iterate over all the entities but the current
        for subentity in entities:
            if subentity is entity:
                continue

            # First case, both inside: so*me_th_in*g.
            # The subentity's two indicators live inside the current entity,
            # so the current length decreases by two, and the subentity
            # offset decreases by one (the removed opening indicator)
            if (subentity.offset > entity.offset and
                    subentity.offset + subentity.length < entity.offset + entity.length):
                entity.length -= 2
                subentity.offset -= 1

            # Second case, one in one out: so*me_th*in_g.
            # In this case, the current entity length is decreased by one,
            # and the subentity offset and length both decrease by one
            elif (entity.offset < subentity.offset < entity.offset + entity.length and
                  subentity.offset + subentity.length > entity.offset + entity.length):
                entity.length -= 1
                subentity.offset -= 1
                subentity.length -= 1

            # Third case, both outside: so*me*th_in_g.
            # In this case, the current entity is left untouched,
            # and the subentity offset decreases by two
            elif subentity.offset > entity.offset + entity.length:
                subentity.offset -= 2

    # Finally, we can join our poor mutilated message back and return
    msg = ''.join(msg)
    return msg, entities
Added full* markdown support and updated README * Although the markdown parser works perfectly, the official Telegram client does not fully reflect it. However, if you still think that this is a lie, go check the markdown parser and test it yourself! 2016-09-07 20:01:00 +03:00			`from tl.types import MessageEntityBold, MessageEntityItalic, MessageEntityCode, MessageEntityTextUrl`


			`def parse_message_entities(msg):`
			`"""Parses a message and returns the parsed message and the entities (bold, italic...).`
			`Note that although markdown-like syntax is used, this does not reflect the complete specification!"""`

			`# Store the entities here`
			`entities = []`

			`# Convert the message to a mutable list`
			`msg = list(msg)`

			`# First, let's handle all the text links in the message, so afterwards it's clean`
			`# for us to get our hands dirty with the other indicators (bold, italic and fixed)`
			`url_indices = [None] * 4 # start/end text index, start/end url index`
			`valid_url_indices = [] # all the valid url_indices found`
			`for i, c in enumerate(msg):`
			`if c is '[':`
			`url_indices[0] = i`

			`# From now on, also ensure that the last item was set`
			`elif c == ']' and url_indices[0]:`
			`url_indices[1] = i`

			`elif c == '(' and url_indices[1]:`
			`# If the previous index (']') is not exactly before the current index ('('),`
			`# then it's not a valid text link, so clear the previous state`
			`if url_indices[1] != i - 1:`
			`url_indices[:2] = [None] * 2`
			`else:`
			`url_indices[2] = i`

			`elif c == ')' and url_indices[2]:`
			`# We have succeeded to find a markdown-like text link!`
			`url_indices[3] = i`
			`valid_url_indices.append(url_indices[:]) # Append a copy`
			`url_indices = [None] * 4`

			`# Iterate in reverse order to clean the text from the urls`
			`# (not to affect previous indices) and append MessageEntityTextUrl's`
			`for i in range(len(valid_url_indices) - 1, -1, -1):`
			`vui = valid_url_indices[i]`

			`# Add 1 when slicing the message not to include the [] nor ()`
			`# There is no need to subtract 1 on the later part because that index is already excluded`
			`link_text = ''.join(msg[vui[0]+1:vui[1]])`
			`link_url = ''.join(msg[vui[2]+1:vui[3]])`

			`# After we have retrieved both the link text and url, replace them in the message`
			`# Now we do have to add 1 to include the [] and () when deleting and replacing!`
			`del msg[vui[2]:vui[3]+1]`
			`msg[vui[0]:vui[1]+1] = link_text`

			`# Finally, update the current valid index url to reflect that all the previous VUI's will be removed`
			`# This is because, after the previous VUI's get done, their part of the message is removed too,`
			`# hence we need to update the current VUI subtracting that removed part length`
			`for prev_vui in valid_url_indices[:i]:`
			`prev_vui_length = prev_vui[3] - prev_vui[2] - 1`
			`displacement = prev_vui_length + len('[]()')`
			`vui[0] -= displacement`
			`vui[1] -= displacement`
			`# No need to subtract the displacement from the URL part (indices 2 and 3)`

			`# When calculating the length, subtract 1 again not to include the previously called ']'`
			`entities.append(MessageEntityTextUrl(offset=vui[0], length=vui[1] - vui[0] - 1, url=link_url))`

			`# After the message is clean from links, handle all the indicator flags`
			`indicator_flags = {`
			`'*': None,`
			`'_': None,`
			'`': None
			`}`

			`# Iterate over the list to find the indicators of entities`
			`for i, c in enumerate(msg):`
			`# Only perform further check if the current character is an indicator`
			`if c in indicator_flags:`
			`# If it is the first time we find this indicator, update its index`
			`if indicator_flags[c] is None:`
			`indicator_flags[c] = i`

			`# Otherwise, it means that we found it before. Hence, the message entity is complete`
			`else:`
			`# Then we have found a new whole valid entity`
			`offset = indicator_flags[c]`
			`length = i - offset - 1 # Subtract -1 not to include the indicator itself`

			`# Add the corresponding entity`
			`if c == '*':`
			`entities.append(MessageEntityBold(offset=offset, length=length))`

			`elif c == '_':`
			`entities.append(MessageEntityItalic(offset=offset, length=length))`

			elif c == '`':
			`entities.append(MessageEntityCode(offset=offset, length=length))`

			`# Clear the flag to start over with this indicator`
			`indicator_flags[c] = None`

			`# Sort the entities by their offset first`
			`entities = sorted(entities, key=lambda e: e.offset)`

			`# Now that all the entities have been found and sorted, remove`
			`# their indicators from the message and update the offsets`
			`for entity in entities:`
			`if type(entity) is not MessageEntityTextUrl:`
			`# Clean the message from the current entity's indicators`
			`del msg[entity.offset + entity.length + 1]`
			`del msg[entity.offset]`

			`# Iterate over all the entities but the current`
			`for subentity in [e for e in entities if e is not entity]:`
			`# First case, one in one out: some_th_ing.`
			`# In this case, the current entity length is decreased by two,`
			`# and all the subentities offset decreases 1`
			`if (subentity.offset > entity.offset and`
			`subentity.offset + subentity.length < entity.offset + entity.length):`
			`entity.length -= 2`
			`subentity.offset -= 1`

			`# Second case, both inside: some_thin_g.`
			`# In this case, the current entity length is decreased by one,`
			`# and all the subentities offset and length decrease 1`
Minor improvement to updates handling Now the updates thread won't start unless you add, at least, one updates handler. Also, if the TcpClient was receiving (i.e., from an update), it will let the update to be received first instead of crashing 2016-09-11 12:50:38 +03:00			`elif (entity.offset < subentity.offset < entity.offset + entity.length and`
			`subentity.offset + subentity.length > entity.offset + entity.length):`
Added full* markdown support and updated README * Although the markdown parser works perfectly, the official Telegram client does not fully reflect it. However, if you still think that this is a lie, go check the markdown parser and test it yourself! 2016-09-07 20:01:00 +03:00			`entity.length -= 1`
			`subentity.offset -= 1`
			`subentity.length -= 1`

			`# Third case, both outside: someth_in_g.`
			`# In this case, the current entity is left untouched,`
			`# and all the subentities offset decreases 2`
			`elif subentity.offset > entity.offset + entity.length:`
			`subentity.offset -= 2`

			`# Finally, we can join our poor mutilated message back and return`
			`msg = ''.join(msg)`
			`return msg, entities`