mirror of
https://github.com/carrotquest/django-clickhouse.git
synced 2024-11-11 03:46:46 +03:00
Speed up of get_sync_objects (parallel execution on multiple databases)
This commit is contained in:
parent
073e002125
commit
96b625ecff
|
@ -19,7 +19,7 @@ from .exceptions import RedisLockTimeoutError
|
|||
from .models import ClickHouseSyncModel
|
||||
from .query import QuerySet
|
||||
from .serializers import Django2ClickHouseModelSerializer
|
||||
from .utils import lazy_class_import
|
||||
from .utils import lazy_class_import, exec_multi_db_func
|
||||
|
||||
|
||||
class ClickHouseModelMeta(InfiModelBase):
|
||||
|
@ -159,8 +159,12 @@ class ClickHouseModel(with_metaclass(ClickHouseModelMeta, InfiModel)):
|
|||
using, pk = pk_str.split('.')
|
||||
pk_by_db[using].add(pk)
|
||||
|
||||
objs = chain(*(cls.get_sync_query_set(using, pk_set) for using, pk_set in pk_by_db.items()))
|
||||
return list(objs)
|
||||
# Selecting data from multiple databases should work faster in parallel, if connections are independent.
|
||||
objs = exec_multi_db_func(
|
||||
lambda db_alias: cls.get_sync_query_set(db_alias, pk_by_db[db_alias]),
|
||||
pk_by_db.keys()
|
||||
)
|
||||
return list(chain(*objs))
|
||||
|
||||
@classmethod
|
||||
def get_insert_batch(cls, import_objects): # type: (Iterable[DjangoModel]) -> List[ClickHouseModel]
|
||||
|
|
|
@ -1,7 +1,10 @@
|
|||
import datetime
|
||||
from queue import Queue, Empty
|
||||
from threading import Thread, Lock
|
||||
|
||||
import os
|
||||
from itertools import chain
|
||||
from typing import Union, Any, Optional, TypeVar, Set, Dict, Iterable, Tuple, Iterator
|
||||
from typing import Union, Any, Optional, TypeVar, Set, Dict, Iterable, Tuple, Iterator, Callable, List
|
||||
|
||||
import pytz
|
||||
import six
|
||||
|
@ -158,3 +161,102 @@ def int_ranges(items: Iterable[int]) -> Iterator[Tuple[int, int]]:
|
|||
raise StopIteration()
|
||||
else:
|
||||
yield interval_start, prev_item
|
||||
|
||||
|
||||
class ExceptionThread(Thread):
    """
    Thread subclass that captures an exception raised inside the thread body
    and re-raises it in the joining (typically main) thread.
    """
    def __init__(self, *args, **kwargs):
        super(ExceptionThread, self).__init__(*args, **kwargs)
        # Last exception raised inside run(), if any; None while no failure occurred
        self.exc = None

    def run(self):
        # Delegate to the base implementation, but remember any exception
        # instead of letting it die silently with the worker thread.
        try:
            return super(ExceptionThread, self).run()
        except Exception as err:
            self.exc = err

    def join(self, timeout=None):
        # Standard join, then surface the captured exception to the caller.
        super(ExceptionThread, self).join(timeout=timeout)
        if self.exc:
            raise self.exc
|
||||
|
||||
|
||||
def exec_in_parallel(func: Callable, args_queue: Queue, threads_count: Optional[int] = None) -> List[Any]:
    """
    Executes func in multiple threads in parallel.
    Functions are expected to be thread safe. If func needs some locks, it must provide them.
    :param func: Function to execute in a thread
    :param args_queue: A queue with arguments for separate function calls. Each element is a tuple of (args, kwargs)
    :param threads_count: Maximum number of parallel threads to run
    :return: A list of results. Order of results is not guaranteed. Element types depend on func return type.
    """
    results = []
    results_lock = Lock()

    # If threads_count is not given, we execute all tasks in parallel.
    # If the queue has fewer elements than threads_count, take the queue size.
    threads_count = min(args_queue.qsize(), threads_count) if threads_count else args_queue.qsize()

    def _worker():
        """
        Thread worker: takes the next arguments tuple from the queue and calls func with it.
        Results are appended to the shared list under a lock. Finishes when the queue is empty.
        :return: None
        """
        while True:
            try:
                args, kwargs = args_queue.get_nowait()
            except Empty:
                # No data left in the queue, finish the worker thread
                return

            try:
                local_res = func(*args, **kwargs)

                # Write result under lock
                with results_lock:
                    results.append(local_res)
            finally:
                # BUGFIX: mark the item processed even if func raised.
                # Previously task_done() was skipped on exception, so any
                # caller relying on args_queue.join() could hang forever.
                args_queue.task_done()

    # Run worker threads; ExceptionThread re-raises a worker's exception on join()
    threads = []
    for _ in range(threads_count):
        t = ExceptionThread(target=_worker)
        threads.append(t)
        t.start()

    # Wait for threads to finish (the first captured exception is re-raised here)
    for t in threads:
        t.join()

    return results
|
||||
|
||||
|
||||
def exec_multi_db_func(func: Callable, using: Iterable[str], *args, threads_count: Optional[int] = None,
                       **kwargs) -> List[Any]:
    """
    Runs a per-database function for every given alias, in parallel threads when useful.
    func receives the database alias as its first positional argument, followed by args and kwargs.
    For a single alias no separate threads are started - the call happens in the current thread.
    :param func: Function to execute on a single database
    :param using: A list of database aliases to use.
    :param threads_count: Maximum number of threads to run in parallel
    :return: A list of execution results. Order of execution is not guaranteed.
    """
    aliases = list(using)

    # Fast paths: nothing to do, or one database - no threading overhead needed.
    if not aliases:
        return []
    if len(aliases) == 1:
        return [func(aliases[0], *args, **kwargs)]

    # General case: queue one (args, kwargs) task per alias for the parallel helper.
    tasks = Queue()
    for alias in aliases:
        tasks.put(([alias] + list(args), kwargs))

    return exec_in_parallel(func, tasks, threads_count=threads_count)
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
import datetime
|
||||
from queue import Queue
|
||||
|
||||
import pytz
|
||||
from django.test import TestCase
|
||||
|
||||
from django_clickhouse.models import ClickHouseSyncModel
|
||||
from django_clickhouse.utils import get_tz_offset, format_datetime, lazy_class_import, int_ranges
|
||||
from django_clickhouse.utils import get_tz_offset, format_datetime, lazy_class_import, int_ranges, exec_in_parallel
|
||||
|
||||
|
||||
class GetTZOffsetTest(TestCase):
|
||||
|
@ -67,3 +68,34 @@ class TestIntRanges(TestCase):
|
|||
def test_bounds(self):
|
||||
self.assertListEqual([(1, 1), (5, 6), (10, 10)],
|
||||
list(int_ranges([1, 5, 6, 10])))
|
||||
|
||||
|
||||
class TestExecInParallel(TestCase):
    base_classes = []

    def _filled_queue(self):
        # Helper: a queue of (args, kwargs) call tuples for the items 0..9
        q = Queue()
        for item in range(10):
            q.put(([item], {}))
        return q

    def test_exec(self):
        # Explicit thread limit: results may arrive in any order, compare as sets
        res = exec_in_parallel(lambda x: x * x, self._filled_queue(), 4)
        self.assertSetEqual({x * x for x in range(10)}, set(res))

    def test_exec_no_count(self):
        # No limit given: one thread per queued task
        res = exec_in_parallel(lambda x: x * x, self._filled_queue())
        self.assertSetEqual({x * x for x in range(10)}, set(res))

    def test_exception(self):
        # A worker exception must be re-raised in the calling thread
        def _test_func(x):
            raise TypeError("Exception in thread %d" % x)

        with self.assertRaises(TypeError):
            exec_in_parallel(_test_func, self._filled_queue())
|
||||
|
|
Loading…
Reference in New Issue
Block a user