Telethon/telethon/parser/markdown_parser.py

from ..tl.types import (MessageEntityBold, MessageEntityCode,
                        MessageEntityItalic, MessageEntityTextUrl)


def _find_text_links(msg):
    """Locates every markdown-like ``[text](url)`` link in a sequence of characters.

       Returns a list of (text_start, text_end, url_start, url_end) tuples holding
       the indices of the '[', ']', '(' and ')' characters of each link, in order
       of appearance. The returned links never overlap."""
    links = []

    # Indices are compared against None (and not by truthiness)
    # because 0 is a perfectly valid index for the opening bracket
    text_start = text_end = url_start = None
    for i, c in enumerate(msg):
        if c == '[':
            # A new opening bracket always starts over, discarding any partial match
            text_start, text_end, url_start = i, None, None

        elif url_start is not None:
            # While inside the URL part only the closing parenthesis is meaningful;
            # stray ']' or '(' must not alter the indices that were already found
            if c == ')':
                links.append((text_start, text_end, url_start, i))
                text_start = text_end = url_start = None

        elif c == ']' and text_start is not None:
            text_end = i

        elif c == '(' and text_end is not None:
            # If the previous index (']') is not exactly before the current index ('('),
            # then it's not a valid text link, so clear the previous state
            if text_end == i - 1:
                url_start = i
            else:
                text_start = text_end = None

    return links


def _strip_text_links(msg, links):
    """Removes the markup of the given links from msg, leaving only their text.

       Returns the new list of characters and a list of (offset, length, url)
       tuples, where offset and length refer to positions in the returned list."""
    stripped = []
    spans = []
    cursor = 0  # first index of msg which has not been copied yet
    for text_start, text_end, url_start, url_end in links:
        # Copy whatever lies between the previous link and this one untouched
        stripped.extend(msg[cursor:text_start])

        # Add 1 when slicing not to include the [] nor ()
        offset = len(stripped)
        stripped.extend(msg[text_start + 1:text_end])
        url = ''.join(msg[url_start + 1:url_end])

        spans.append((offset, len(stripped) - offset, url))
        cursor = url_end + 1

    stripped.extend(msg[cursor:])
    return stripped, spans


def _pair_indicators(msg):
    """Pairs the '*', '_' and '`' indicators found in a sequence of characters.

       Returns a list of (opening_index, closing_index, indicator) tuples.
       Indicators left without a partner are not reported."""
    pending = {'*': None, '_': None, '`': None}
    pairs = []
    for i, c in enumerate(msg):
        if c not in pending:
            continue

        if pending[c] is None:
            # First time we find this indicator, remember where
            pending[c] = i
        else:
            # We found it before, hence the entity is complete
            pairs.append((pending[c], i, c))
            pending[c] = None

    return pairs


def parse_message_entities(msg):
    """Parses a message and returns the parsed message and the entities (bold, italic...).
       Note that although markdown-like syntax is used, this does not reflect the complete specification!

       Supported syntax is ``[text](url)`` for text links, ``*bold*``, ``_italic_``
       and ```code```. Link markup is resolved first, so indicators that are
       part of an URL are never interpreted. Unpaired indicators and incomplete
       links are left in the message as they are.

       :param msg: the text to parse (any iterable of single characters).
       :return: a tuple consisting of (clean message as str, list of entities).
                Every entity offset and length is expressed in characters of
                the returned clean message, and the entities are sorted by
                the position at which they start."""

    # Convert the message to a mutable list
    msg = list(msg)

    # First, let's handle all the text links in the message, so afterwards it's
    # clean for us to handle the other indicators (bold, italic and fixed)
    msg, link_spans = _strip_text_links(msg, _find_text_links(msg))

    # After the message is clean from links, find all the paired indicators
    pairs = _pair_indicators(msg)
    removed = set()
    for opening, closing, _ in pairs:
        removed.add(opening)
        removed.add(closing)

    # kept_before[i] is the number of characters of msg[:i] which will survive
    # once the indicators are removed, i.e. the final position of index i.
    # Translating every index through this table replaces any pairwise offset
    # bookkeeping, and works no matter how the entities nest or overlap.
    kept_before = [0]
    for i in range(len(msg)):
        kept_before.append(kept_before[-1] + (0 if i in removed else 1))

    # Collect (position before cleaning, entity) so we can sort afterwards.
    # Note the entity types are only looked up when an entity actually exists.
    found = []
    for offset, length, url in link_spans:
        start = kept_before[offset]
        end = kept_before[offset + length]
        found.append((offset, MessageEntityTextUrl(
            offset=start, length=end - start, url=url)))

    for opening, closing, indicator in pairs:
        start = kept_before[opening]
        length = kept_before[closing] - start
        if indicator == '*':
            entity = MessageEntityBold(offset=start, length=length)
        elif indicator == '_':
            entity = MessageEntityItalic(offset=start, length=length)
        else:
            entity = MessageEntityCode(offset=start, length=length)

        found.append((opening, entity))

    # Sort the entities by the position where they started (the sort is
    # stable, so a link comes before an indicator found at the same place)
    found.sort(key=lambda item: item[0])
    entities = [entity for _, entity in found]

    # Finally, drop the indicators and join the message back
    msg = ''.join(c for i, c in enumerate(msg) if i not in removed)
    return msg, entities
Use relative imports always where possible 2017-05-21 14:02:54 +03:00			`from ..tl.types import (MessageEntityBold, MessageEntityCode,`
			`MessageEntityItalic, MessageEntityTextUrl)`
Added full* markdown support and updated README * Although the markdown parser works perfectly, the official Telegram client does not fully reflect it. However, if you still think that this is a lie, go check the markdown parser and test it yourself! 2016-09-07 20:01:00 +03:00

			`def parse_message_entities(msg):`
			`"""Parses a message and returns the parsed message and the entities (bold, italic...).`
			`Note that although markdown-like syntax is used, this does not reflect the complete specification!"""`

			`# Store the entities here`
			`entities = []`

			`# Convert the message to a mutable list`
			`msg = list(msg)`

			`# First, let's handle all the text links in the message, so afterwards it's clean`
			`# for us to get our hands dirty with the other indicators (bold, italic and fixed)`
			`url_indices = [None] * 4 # start/end text index, start/end url index`
			`valid_url_indices = [] # all the valid url_indices found`
			`for i, c in enumerate(msg):`
			`if c is '[':`
			`url_indices[0] = i`

			`# From now on, also ensure that the last item was set`
			`elif c == ']' and url_indices[0]:`
			`url_indices[1] = i`

			`elif c == '(' and url_indices[1]:`
			`# If the previous index (']') is not exactly before the current index ('('),`
			`# then it's not a valid text link, so clear the previous state`
			`if url_indices[1] != i - 1:`
			`url_indices[:2] = [None] * 2`
			`else:`
			`url_indices[2] = i`

			`elif c == ')' and url_indices[2]:`
			`# We have succeeded to find a markdown-like text link!`
			`url_indices[3] = i`
			`valid_url_indices.append(url_indices[:]) # Append a copy`
			`url_indices = [None] * 4`

			`# Iterate in reverse order to clean the text from the urls`
			`# (not to affect previous indices) and append MessageEntityTextUrl's`
			`for i in range(len(valid_url_indices) - 1, -1, -1):`
			`vui = valid_url_indices[i]`

			`# Add 1 when slicing the message not to include the [] nor ()`
			`# There is no need to subtract 1 on the later part because that index is already excluded`
Many code-style improvements 2016-11-30 00:29:42 +03:00			`link_text = ''.join(msg[vui[0] + 1:vui[1]])`
			`link_url = ''.join(msg[vui[2] + 1:vui[3]])`
Added full* markdown support and updated README * Although the markdown parser works perfectly, the official Telegram client does not fully reflect it. However, if you still think that this is a lie, go check the markdown parser and test it yourself! 2016-09-07 20:01:00 +03:00
			`# After we have retrieved both the link text and url, replace them in the message`
			`# Now we do have to add 1 to include the [] and () when deleting and replacing!`
Many code-style improvements 2016-11-30 00:29:42 +03:00			`del msg[vui[2]:vui[3] + 1]`
			`msg[vui[0]:vui[1] + 1] = link_text`
Added full* markdown support and updated README * Although the markdown parser works perfectly, the official Telegram client does not fully reflect it. However, if you still think that this is a lie, go check the markdown parser and test it yourself! 2016-09-07 20:01:00 +03:00
			`# Finally, update the current valid index url to reflect that all the previous VUI's will be removed`
			`# This is because, after the previous VUI's get done, their part of the message is removed too,`
			`# hence we need to update the current VUI subtracting that removed part length`
			`for prev_vui in valid_url_indices[:i]:`
			`prev_vui_length = prev_vui[3] - prev_vui[2] - 1`
			`displacement = prev_vui_length + len('[]()')`
			`vui[0] -= displacement`
			`vui[1] -= displacement`
			`# No need to subtract the displacement from the URL part (indices 2 and 3)`

			`# When calculating the length, subtract 1 again not to include the previously called ']'`
Many code-style improvements 2016-11-30 00:29:42 +03:00			`entities.append(`
			`MessageEntityTextUrl(`
			`offset=vui[0], length=vui[1] - vui[0] - 1, url=link_url))`
Added full* markdown support and updated README * Although the markdown parser works perfectly, the official Telegram client does not fully reflect it. However, if you still think that this is a lie, go check the markdown parser and test it yourself! 2016-09-07 20:01:00 +03:00
			`# After the message is clean from links, handle all the indicator flags`
Many code-style improvements 2016-11-30 00:29:42 +03:00			indicator_flags = {'*': None, '_': None, '`': None}
Added full* markdown support and updated README * Although the markdown parser works perfectly, the official Telegram client does not fully reflect it. However, if you still think that this is a lie, go check the markdown parser and test it yourself! 2016-09-07 20:01:00 +03:00
			`# Iterate over the list to find the indicators of entities`
			`for i, c in enumerate(msg):`
			`# Only perform further check if the current character is an indicator`
			`if c in indicator_flags:`
			`# If it is the first time we find this indicator, update its index`
			`if indicator_flags[c] is None:`
			`indicator_flags[c] = i`

			`# Otherwise, it means that we found it before. Hence, the message entity is complete`
			`else:`
			`# Then we have found a new whole valid entity`
			`offset = indicator_flags[c]`
			`length = i - offset - 1 # Subtract -1 not to include the indicator itself`

			`# Add the corresponding entity`
			`if c == '*':`
Many code-style improvements 2016-11-30 00:29:42 +03:00			`entities.append(`
			`MessageEntityBold(`
			`offset=offset, length=length))`
Added full* markdown support and updated README * Although the markdown parser works perfectly, the official Telegram client does not fully reflect it. However, if you still think that this is a lie, go check the markdown parser and test it yourself! 2016-09-07 20:01:00 +03:00
			`elif c == '_':`
Many code-style improvements 2016-11-30 00:29:42 +03:00			`entities.append(`
			`MessageEntityItalic(`
			`offset=offset, length=length))`
Added full* markdown support and updated README * Although the markdown parser works perfectly, the official Telegram client does not fully reflect it. However, if you still think that this is a lie, go check the markdown parser and test it yourself! 2016-09-07 20:01:00 +03:00
			elif c == '`':
Many code-style improvements 2016-11-30 00:29:42 +03:00			`entities.append(`
			`MessageEntityCode(`
			`offset=offset, length=length))`
Added full* markdown support and updated README * Although the markdown parser works perfectly, the official Telegram client does not fully reflect it. However, if you still think that this is a lie, go check the markdown parser and test it yourself! 2016-09-07 20:01:00 +03:00
			`# Clear the flag to start over with this indicator`
			`indicator_flags[c] = None`

			`# Sort the entities by their offset first`
			`entities = sorted(entities, key=lambda e: e.offset)`

			`# Now that all the entities have been found and sorted, remove`
			`# their indicators from the message and update the offsets`
			`for entity in entities:`
			`if type(entity) is not MessageEntityTextUrl:`
			`# Clean the message from the current entity's indicators`
			`del msg[entity.offset + entity.length + 1]`
			`del msg[entity.offset]`

			`# Iterate over all the entities but the current`
			`for subentity in [e for e in entities if e is not entity]:`
			`# First case, one in one out: so*me_th_in*g.`
			`# In this case, the current entity length is decreased by two,`
			`# and all the subentities offset decreases 1`
			`if (subentity.offset > entity.offset and`
Many code-style improvements 2016-11-30 00:29:42 +03:00			`subentity.offset + subentity.length <`
			`entity.offset + entity.length):`
Added full* markdown support and updated README * Although the markdown parser works perfectly, the official Telegram client does not fully reflect it. However, if you still think that this is a lie, go check the markdown parser and test it yourself! 2016-09-07 20:01:00 +03:00			`entity.length -= 2`
			`subentity.offset -= 1`

			`# Second case, both inside: so*me_th*in_g.`
			`# In this case, the current entity length is decreased by one,`
			`# and all the subentities offset and length decrease 1`
Many code-style improvements 2016-11-30 00:29:42 +03:00			`elif (entity.offset < subentity.offset < entity.offset +`
			`entity.length < subentity.offset + subentity.length):`
Added full* markdown support and updated README * Although the markdown parser works perfectly, the official Telegram client does not fully reflect it. However, if you still think that this is a lie, go check the markdown parser and test it yourself! 2016-09-07 20:01:00 +03:00			`entity.length -= 1`
			`subentity.offset -= 1`
			`subentity.length -= 1`

			`# Third case, both outside: so*me*th_in_g.`
			`# In this case, the current entity is left untouched,`
			`# and all the subentities offset decreases 2`
			`elif subentity.offset > entity.offset + entity.length:`
			`subentity.offset -= 2`

			`# Finally, we can join our poor mutilated message back and return`
			`msg = ''.join(msg)`
			`return msg, entities`