Add and document 'parse_lang' function for language detection

- Implemented the `parse_lang` function to extract the language of a code block from a message.
- This feature aids in identifying and displaying code blocks in Telegram using markdown v2 syntax.
- The function takes a string and the start/end indices of a code block, returning the detected language together with the updated text and end index.
- Integrated `parse_lang` into the `parse` function for enhanced message processing.
- Supports language-specific formatting in Telegram messages.
2025-09-19 02:12:40 +03:00 · 2023-11-21 18:06:35 +01:00 · 2023-11-21 18:06:35 +01:00 · cbcc50948f
commit cbcc50948f
parent cf3bc71e1d
1 changed files with 33 additions and 1 deletions
--- a/telethon/extensions/markdown.py
+++ b/telethon/extensions/markdown.py
@ -30,6 +30,37 @@ def overlap(a, b, x, y):
    return max(a, x) < min(b, y)


+def parse_lang(text: str, i: int, end: int) -> tuple[str, str, int]:
+    """
+    Extracts the language of a code block from a message.
+
+    The language is the block's first line, stripped of whitespace, and is
+    only accepted when it is 1 to 32 characters out of ``[a-zA-Z0-9_-]``.
+
+    :param text: the message to extract the language from.
+    :param i: the start of the code block.
+    :param end: the end of the code block.
+    :return: a tuple consisting of (language, text, end). If a language is
+        found, it is cut out of the text and `end` is moved to the new end
+        of the block; otherwise language is '' and text/end are unchanged.
+    """
+    # Search for the newline inside the block only: a newline at or past
+    # `end` belongs to the rest of the message, not to a language line.
+    start_code = text.find('\n', i, end)
+    if start_code == -1:
+        return '', text, end
+
+    # Reject anything that is not a plausible language identifier
+    lang = text[i:start_code].strip()
+    if re.fullmatch(r'[a-zA-Z0-9_-]{1,32}', lang) is None:
+        return '', text, end
+
+    # Remove lang and surrounding whitespace from the block; update 'end'
+    code_block = text[start_code:end].strip()
+    text = text[:i] + code_block + text[end:]
+    return lang, text, i + len(code_block)
+
+
 def parse(message, delimiters=None, url_re=None):
    """
    Parses the given markdown message and returns its stripped representation
@ -99,7 +130,8 @@ def parse(message, delimiters=None, url_re=None):
                # Append the found entity
                ent = delimiters[delim]
                if ent == MessageEntityPre:
-                    result.append(ent(i, end - i - len(delim), ''))  # has 'lang'
+                    lang, message, end = parse_lang(message, i, end)
+                    result.append(ent(i, end - i - len(delim), lang))
                else:
                    result.append(ent(i, end - i - len(delim)))