MultiModelConverter now processes sub_models in parallel threads

M1ha 2019-01-21 14:32:12 +05:00
parent 480a857fe8
commit 69c2c5504b
2 changed files with 22 additions and 17 deletions

View File

@@ -19,7 +19,7 @@ from .exceptions import RedisLockTimeoutError
 from .models import ClickHouseSyncModel
 from .query import QuerySet
 from .serializers import Django2ClickHouseModelSerializer
-from .utils import lazy_class_import, exec_multi_db_func
+from .utils import lazy_class_import, exec_multi_arg_func, exec_in_parallel


 class ClickHouseModelMeta(InfiModelBase):
@@ -160,7 +160,7 @@ class ClickHouseModel(with_metaclass(ClickHouseModelMeta, InfiModel)):
             pk_by_db[using].add(pk)

         # Selecting data from multiple databases should work faster in parallel, if connections are independent.
-        objs = exec_multi_db_func(
+        objs = exec_multi_arg_func(
             lambda db_alias: cls.get_sync_query_set(db_alias, pk_by_db[db_alias]),
             pk_by_db.keys()
         )
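
The hunk above only renames the helper: each database alias is still one split argument, so the per-database selects can run on separate threads when several databases are configured. A minimal usage sketch of the same call pattern, assuming the package is importable as django_clickhouse and using a hypothetical model and aliases:

# Illustrative only: fan a per-database query out over threads with the renamed helper.
from django_clickhouse.utils import exec_multi_arg_func  # assumed import path

pk_by_db = {'default': {1, 2, 3}, 'secondary': {4, 5}}   # hypothetical aliases and pk sets

querysets = exec_multi_arg_func(
    lambda db_alias: list(SomeDjangoModel.objects.using(db_alias).filter(pk__in=pk_by_db[db_alias])),
    pk_by_db.keys()
)
# querysets holds one result per alias; the order across threads is not guaranteed.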
@@ -281,16 +281,21 @@ class ClickHouseMultiModel(ClickHouseModel):
             if import_objects:
                 batches = {}
                 with statsd.timer(statsd_key.format('steps.get_insert_batch')):
-                    for model_cls in cls.sub_models:
+                    def _sub_model_func(model_cls):
                         model_statsd_key = "%s.sync.%s.{0}" % (config.STATSD_PREFIX, model_cls.__name__)
                         with statsd.timer(model_statsd_key.format('steps.get_insert_batch')):
-                            batches[model_cls] = model_cls.get_insert_batch(import_objects)
+                            return model_cls, model_cls.get_insert_batch(import_objects)
+
+                    res = exec_multi_arg_func(_sub_model_func, cls.sub_models, threads_count=len(cls.sub_models))
+                    batches = dict(res)

                 with statsd.timer(statsd_key.format('steps.insert')):
-                    for model_cls, batch in batches.items():
+                    def _sub_model_func(model_cls):
                         model_statsd_key = "%s.sync.%s.{0}" % (config.STATSD_PREFIX, model_cls.__name__)
                         with statsd.timer(model_statsd_key.format('steps.insert')):
-                            model_cls.insert_batch(batch)
+                            model_cls.insert_batch(batches[model_cls])
+
+                    exec_multi_arg_func(_sub_model_func, cls.sub_models, threads_count=len(cls.sub_models))

             with statsd.timer(statsd_key.format('steps.post_sync')):
                 storage.post_sync(import_key)
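
The change above replaces two sequential for-loops over cls.sub_models with a local _sub_model_func handed to exec_multi_arg_func, one thread per sub-model; returning (model_cls, batch) pairs lets dict(res) rebuild the batches mapping even though thread completion order is arbitrary. A self-contained sketch of that fan-out/fan-in pattern, with placeholder sub-model classes and assuming exec_multi_arg_func (defined in the second file of this commit) is in scope:

# Placeholder stand-ins for ClickHouse sub-model classes; only the pattern matters here.
class DummySubModelA:
    @classmethod
    def get_insert_batch(cls, objects):
        return ['%s(%s)' % (cls.__name__, o) for o in objects]

class DummySubModelB(DummySubModelA):
    pass

import_objects = [1, 2, 3]
sub_models = [DummySubModelA, DummySubModelB]

def _sub_model_func(model_cls):
    # Runs in its own thread; a (key, value) pair is returned so the caller can
    # rebuild a dict regardless of which thread finishes first.
    return model_cls, model_cls.get_insert_batch(import_objects)

res = exec_multi_arg_func(_sub_model_func, sub_models, threads_count=len(sub_models))
batches = dict(res)   # {DummySubModelA: [...], DummySubModelB: [...]}

The statsd timers from the diff are omitted in the sketch; in the commit they wrap each thread's work exactly as before, just inside the worker function.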

View File

@@ -238,25 +238,25 @@ def exec_in_parallel(func: Callable, args_queue: Queue, threads_count: Optional[
     return results


-def exec_multi_db_func(func: Callable, using: Iterable[str], *args, threads_count: Optional[int] = None,
-                       **kwargs) -> List[Any]:
+def exec_multi_arg_func(func: Callable, split_args: Iterable[Any], *args, threads_count: Optional[int] = None,
+                        **kwargs) -> List[Any]:
     """
-    Executes multiple databases function in parallel threads. Thread functions (func) receive db alias as first argument
+    Executes function in parallel threads. Thread functions (func) receive one of split_args as first argument
     Another arguments passed to functions - args and kwargs
-    If function uses single shard, separate threads are not run, main thread is used.
-    :param func: Function to execute on single database
-    :param using: A list of database aliases to use.
+    If len(split_args) <= 1, separate threads are not run, main thread is used.
+    :param func: Function to execute. Must accept split_arg as first parameter
+    :param split_args: A list of arguments to split threads by
     :param threads_count: Maximum number of threads to run in parallel
     :return: A list of execution results. Order of execution is not guaranteed.
     """
-    using = list(using)
-    if len(using) == 0:
+    split_args = list(split_args)
+    if len(split_args) == 0:
         return []
-    elif len(using) == 1:
-        return [func(using[0], *args, **kwargs)]
+    elif len(split_args) == 1:
+        return [func(split_args[0], *args, **kwargs)]
     else:
         q = Queue()
-        for s in using:
+        for s in split_args:
             q.put(([s] + list(args), kwargs))

         return exec_in_parallel(func, q, threads_count=threads_count)
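
exec_in_parallel itself is not touched by this commit; only its signature appears in the hunk header above. Judging from that signature and from the ([split_arg] + args, kwargs) tuples that exec_multi_arg_func enqueues, it plausibly behaves like a small worker pool. The sketch below is a guess at that behaviour for readers of the diff, not the library's actual implementation:

from queue import Empty, Queue
from threading import Thread
from typing import Any, Callable, List, Optional


def exec_in_parallel_sketch(func: Callable, args_queue: Queue,
                            threads_count: Optional[int] = None) -> List[Any]:
    # Assumed behaviour: each queue item is an (args_list, kwargs_dict) pair,
    # matching what exec_multi_arg_func enqueues above.
    threads_count = threads_count or args_queue.qsize()
    results = []

    def _worker():
        while True:
            try:
                args, kwargs = args_queue.get_nowait()
            except Empty:
                return
            try:
                results.append(func(*args, **kwargs))
            finally:
                args_queue.task_done()

    threads = [Thread(target=_worker) for _ in range(threads_count)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return results

Under that reading, exec_multi_arg_func(str.upper, ['a', 'b', 'c']) would return ['A', 'B', 'C'] in some thread-dependent order, while an empty or single-element argument list short-circuits in the main thread, as the function above shows.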