Telethon/telethon_generator/parsers/tlobject/parser.py

import collections
import re

from .tlarg import TLArg
from .tlobject import TLObject
from ..methods import Usability


# IDs of the core constructors. This set is the default ``ignored_ids``
# of ``parse_tl``, so objects with one of these IDs are skipped there.
CORE_TYPES = {
    # boolFalse#bc799737 = Bool;
    0xbc799737,
    # boolTrue#997275b5 = Bool;
    0x997275b5,
    # true#3fedd339 = True;
    0x3fedd339,
    # error#c4b9f9bb code:int text:string = Error;
    0xc4b9f9bb,
    # null#56730bcc = Null;
    0x56730bcc,
}

# Neither Telegram Desktop (C++) nor the .tl files distinguish between
# string and bytes. Python *does*, and during the authorization key
# process we want to work with bytes, not UTF-8 strings (they won't be).
#
# Every object whose ID is listed here gets the ``string`` type of its
# arguments replaced with ``bytes``.
AUTH_KEY_TYPES = {
    # resPQ
    0x05162463,
    # p_q_inner_data
    0x83c95aec,
    # p_q_inner_data_dc
    0xa9f55f95,
    # p_q_inner_data_temp
    0x3c6a84d4,
    # p_q_inner_data_temp_dc
    0x56fddf88,
    # server_DH_params_ok
    0xd0e8075c,
    # server_DH_inner_data
    0xb5890dba,
    # client_DH_inner_data
    0x6643b654,
    # req_DH_params
    0xd712e4be,
    # set_client_DH_params
    0xf5045f1f,
    # gzip_packed
    0x3072cfa1,
}


def _from_line(line, is_function, method_info, layer):
    match = re.match(
        r'^([\w.]+)'                     # 'name'
        r'(?:#([0-9a-fA-F]+))?'          # '#optionalcode'
        r'(?:\s{?\w+:[\w\d<>#.?!]+}?)*'  # '{args:.0?type}'
        r'\s=\s'                         # ' = '
        r'([\w\d<>#.?]+);$',             # '<result.type>;'
        line
    )
    if match is None:
        # Probably "vector#1cb5c415 {t:Type} # [ t ] = Vector t;"
        raise ValueError('Cannot parse TLObject {}'.format(line))

    args_match = re.findall(
        r'({)?'
        r'(\w+)'
        r':'
        r'([\w\d<>#.?!]+)'
        r'}?',
        line
    )

    name = match.group(1)
    if name in method_info:
        usability = method_info[name].usability
    else:
        usability = Usability.UNKNOWN

    return TLObject(
        fullname=name,
        object_id=match.group(2),
        result=match.group(3),
        is_function=is_function,
        layer=layer,
        usability=usability,
        args=[TLArg(name, arg_type, brace != '')
              for brace, name, arg_type in args_match]
    )


def parse_tl(file_path, layer, methods=None, ignored_ids=CORE_TYPES):
    """
    This method yields TLObjects from a given .tl file.

    Note that the file is parsed completely before the function yields
    because references to other objects may appear later in the file.

    :param file_path: path object whose ``open()`` method is used to
        read the file (it is read as UTF-8).
    :param layer: value handed to every parsed object.
    :param methods: optional iterable of objects with a ``name``
        attribute, used to look up information about each definition.
    :param ignored_ids: container of object IDs; any parsed object
        whose ``id`` is in it is skipped.
    :raises ValueError: if a line cannot be parsed, unless that line
        is the ``vector#1cb5c415`` definition, which is skipped.
    """
    method_info = {m.name: m for m in (methods or [])}
    obj_all = []
    obj_by_name = {}
    obj_by_type = collections.defaultdict(list)

    # Raw string: '\w' inside a normal string is an invalid escape.
    section_re = re.compile(r'---(\w+)---')

    with file_path.open(encoding='utf-8') as file:
        is_function = False
        for line in file:
            # Drop anything after a '//' comment marker
            line = line.partition('//')[0].strip()
            if not line:
                continue

            match = section_re.match(line)
            if match:
                # e.g. '---functions---' or '---types---'
                is_function = match.group(1) == 'functions'
                continue

            # Only the parsing itself is expected to fail
            try:
                result = _from_line(
                    line, is_function, method_info, layer=layer)
            except ValueError as e:
                if 'vector#1cb5c415' not in str(e):
                    raise
                continue

            if result.id in ignored_ids:
                continue

            obj_all.append(result)
            # Functions are not valid argument types, so only
            # constructors are indexed by name and by result type
            if not result.is_function:
                obj_by_name[result.fullname] = result
                obj_by_type[result.result].append(result)

    # Once all objects have been parsed, replace the
    # string type from the arguments with references
    for obj in obj_all:
        if obj.id in AUTH_KEY_TYPES:
            for arg in obj.args:
                if arg.type == 'string':
                    arg.type = 'bytes'

        for arg in obj.args:
            # Prefer the constructors that result in this type, then
            # fall back to a constructor with that exact name
            arg.cls = obj_by_type.get(arg.type) or (
                [obj_by_name[arg.type]] if arg.type in obj_by_name else []
            )

    yield from obj_all


def find_layer(file_path):
    """
    Finds the layer used on the specified scheme.tl file.

    The file is read as UTF-8, looking for the first line of the form
    ``// LAYER <number>``.

    :param file_path: path of the file to open.
    :return: the layer number as an ``int``, or ``None`` when no line
        of the file matches.
    """
    pattern = re.compile(r'^//\s*LAYER\s*(\d+)$')
    with open(file_path, 'r', encoding='utf-8') as stream:
        # Lazily match line by line, stopping at the first hit
        first_hit = next(filter(None, map(pattern.match, stream)), None)

    if first_hit is None:
        return None
    return int(first_hit.group(1))
Split generator/tlobject into separate files 2018-10-15 20:29:32 +03:00			`import collections`
			`import re`

			`from .tlarg import TLArg`
			`from .tlobject import TLObject`
Clarify who can use methods in the documentation 2018-12-03 18:17:37 +03:00			`from ..methods import Usability`
Split generator/tlobject into separate files 2018-10-15 20:29:32 +03:00

Skip core types when parsing .tl files This avoids the need to comment then every time the file changes. 2018-12-15 14:38:56 +03:00			`CORE_TYPES = {`
			`0xbc799737, # boolFalse#bc799737 = Bool;`
			`0x997275b5, # boolTrue#997275b5 = Bool;`
			`0x3fedd339, # true#3fedd339 = True;`
			`0xc4b9f9bb, # error#c4b9f9bb code:int text:string = Error;`
			`0x56730bcc # null#56730bcc = Null;`
			`}`

Prevent common pitfall when pulling new .tl files 2018-12-20 22:51:15 +03:00			`# Telegram Desktop (C++) doesn't care about string/bytes, and the .tl files`
			`# don't either. However in Python we do, and we want to deal with bytes`
			`# for the authorization key process, not UTF-8 strings (they won't be).`
			`#`
			`# Every type with an ID that's in here should get their attribute types`
			`# with string being replaced with bytes.`
			`AUTH_KEY_TYPES = {`
			`0x05162463, # resPQ,`
			`0x83c95aec, # p_q_inner_data`
			`0xa9f55f95, # p_q_inner_data_dc`
			`0x3c6a84d4, # p_q_inner_data_temp`
			`0x56fddf88, # p_q_inner_data_temp_dc`
			`0xd0e8075c, # server_DH_params_ok`
			`0xb5890dba, # server_DH_inner_data`
			`0x6643b654, # client_DH_inner_data`
			`0xd712e4be, # req_DH_params`
			`0xf5045f1f, # set_client_DH_params`
			`0x3072cfa1 # gzip_packed`
			`}`

Skip core types when parsing .tl files This avoids the need to comment then every time the file changes. 2018-12-15 14:38:56 +03:00
Clarify who can use methods in the documentation 2018-12-03 18:17:37 +03:00			`def _from_line(line, is_function, method_info, layer):`
Split generator/tlobject into separate files 2018-10-15 20:29:32 +03:00			`match = re.match(`
			`r'^([\w.]+)' # 'name'`
			`r'(?:#([0-9a-fA-F]+))?' # '#optionalcode'`
			`r'(?:\s{?\w+:[\w\d<>#.?!]+}?)*' # '{args:.0?type}'`
			`r'\s=\s' # ' = '`
			`r'([\w\d<>#.?]+);$', # '<result.type>;'`
			`line`
			`)`
			`if match is None:`
			`# Probably "vector#1cb5c415 {t:Type} # [ t ] = Vector t;"`
			`raise ValueError('Cannot parse TLObject {}'.format(line))`

			`args_match = re.findall(`
			`r'({)?'`
			`r'(\w+)'`
			`r':'`
			`r'([\w\d<>#.?!]+)'`
			`r'}?',`
			`line`
			`)`
Clarify who can use methods in the documentation 2018-12-03 18:17:37 +03:00
			`name = match.group(1)`
			`if name in method_info:`
			`usability = method_info[name].usability`
			`else:`
			`usability = Usability.UNKNOWN`

Split generator/tlobject into separate files 2018-10-15 20:29:32 +03:00			`return TLObject(`
Clarify who can use methods in the documentation 2018-12-03 18:17:37 +03:00			`fullname=name,`
Split generator/tlobject into separate files 2018-10-15 20:29:32 +03:00			`object_id=match.group(2),`
			`result=match.group(3),`
			`is_function=is_function,`
			`layer=layer,`
Clarify who can use methods in the documentation 2018-12-03 18:17:37 +03:00			`usability=usability,`
Split generator/tlobject into separate files 2018-10-15 20:29:32 +03:00			`args=[TLArg(name, arg_type, brace != '')`
			`for brace, name, arg_type in args_match]`
			`)`


Skip core types when parsing .tl files This avoids the need to comment then every time the file changes. 2018-12-15 14:38:56 +03:00			`def parse_tl(file_path, layer, methods=None, ignored_ids=CORE_TYPES):`
Split generator/tlobject into separate files 2018-10-15 20:29:32 +03:00			`"""`
			`This method yields TLObjects from a given .tl file.`

			`Note that the file is parsed completely before the function yields`
			`because references to other objects may appear later in the file.`
			`"""`
Clarify who can use methods in the documentation 2018-12-03 18:17:37 +03:00			`method_info = {m.name: m for m in (methods or [])}`
Split generator/tlobject into separate files 2018-10-15 20:29:32 +03:00			`obj_all = []`
			`obj_by_name = {}`
			`obj_by_type = collections.defaultdict(list)`
Make use of pathlib nearly everywhere (breaks docs gen) Python 3.6 introduced support for the os.PathLike interface, which means Python 3.5 did not have it yet and attempting to use it in os functions would fail. Instead we can use pathlib for everything, but not all work is done yet. 2018-12-21 15:24:16 +03:00			`with file_path.open() as file:`
Split generator/tlobject into separate files 2018-10-15 20:29:32 +03:00			`is_function = False`
			`for line in file:`
			`comment_index = line.find('//')`
			`if comment_index != -1:`
			`line = line[:comment_index]`

			`line = line.strip()`
			`if not line:`
			`continue`

			`match = re.match('---(\w+)---', line)`
			`if match:`
			`following_types = match.group(1)`
			`is_function = following_types == 'functions'`
			`continue`

			`try:`
Clarify who can use methods in the documentation 2018-12-03 18:17:37 +03:00			`result = _from_line(`
			`line, is_function, method_info, layer=layer)`

Skip core types when parsing .tl files This avoids the need to comment then every time the file changes. 2018-12-15 14:38:56 +03:00			`if result.id in ignored_ids:`
			`continue`

Split generator/tlobject into separate files 2018-10-15 20:29:32 +03:00			`obj_all.append(result)`
Don't add functions as possible argument types 2018-10-15 22:12:10 +03:00			`if not result.is_function:`
			`obj_by_name[result.fullname] = result`
			`obj_by_type[result.result].append(result)`
Split generator/tlobject into separate files 2018-10-15 20:29:32 +03:00			`except ValueError as e:`
			`if 'vector#1cb5c415' not in str(e):`
			`raise`

			`# Once all objects have been parsed, replace the`
			`# string type from the arguments with references`
			`for obj in obj_all:`
Prevent common pitfall when pulling new .tl files 2018-12-20 22:51:15 +03:00			`if obj.id in AUTH_KEY_TYPES:`
			`for arg in obj.args:`
			`if arg.type == 'string':`
			`arg.type = 'bytes'`

Split generator/tlobject into separate files 2018-10-15 20:29:32 +03:00			`for arg in obj.args:`
			`arg.cls = obj_by_type.get(arg.type) or (`
			`[obj_by_name[arg.type]] if arg.type in obj_by_name else []`
			`)`

			`yield from obj_all`


			`def find_layer(file_path):`
			`"""Finds the layer used on the specified scheme.tl file."""`
			`layer_regex = re.compile(r'^//\sLAYER\s(\d+)$')`
			`with open(file_path, 'r', encoding='utf-8') as file:`
			`for line in file:`
			`match = layer_regex.match(line)`
			`if match:`
			`return int(match.group(1))`