django-clickhouse/src/django_clickhouse/utils.py


import datetime
from queue import Queue, Empty
from threading import Thread, Lock
import os
from itertools import chain
from typing import Union, Any, Optional, TypeVar, Set, Dict, Iterable, Tuple, Iterator, Callable, List
import pytz
import six
from importlib import import_module
from importlib.util import find_spec
from django.db.models import Model as DjangoModel
from .database import connections
T = TypeVar('T')
def get_tz_offset(db_alias=None): # type: (Optional[str]) -> int
"""
Returns ClickHouse server timezone offset in minutes
:param db_alias: The database alias used
:return: Integer
"""
db = connections[db_alias]
return int(db.server_timezone.utcoffset(datetime.datetime.utcnow()).total_seconds() / 60)
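
# Usage sketch (illustrative only, not part of the original module); requires a configured
# ClickHouse connection, and the 'default' alias below is an assumption:
#
#   get_tz_offset('default')  # e.g. 180 for a server running in the Europe/Moscow timezone
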
def format_datetime(dt, timezone_offset=0, day_end=False, db_alias=None):
# type: (Union[datetime.date, datetime.datetime], int, bool, Optional[str]) -> str
"""
    Formats datetime and date objects to a format that can be used in WHERE conditions of a query
:param dt: datetime.datetime or datetime.date object
:param timezone_offset: timezone offset (minutes)
:param day_end: If datetime.date is given and flag is set, returns day end time, not day start.
:param db_alias: The database alias used
:return: A string representing datetime
"""
    assert isinstance(dt, (datetime.datetime, datetime.date)), "dt must be a datetime.datetime or datetime.date instance"
    assert type(timezone_offset) is int, "timezone_offset must be integer"

    # datetime.datetime is a subclass of datetime.date, so isinstance(dt, datetime.date) can't distinguish them
if not isinstance(dt, datetime.datetime):
t = datetime.time.max if day_end else datetime.time.min
dt = datetime.datetime.combine(dt, t)
# Convert datetime to UTC, if it has timezone
if dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None:
dt = pytz.utc.localize(dt)
else:
dt = dt.astimezone(pytz.utc)
    # ClickHouse parses dates in the server's local timezone, so the server timezone offset has to be applied
server_dt = dt - datetime.timedelta(minutes=timezone_offset - get_tz_offset(db_alias))
return server_dt.strftime("%Y-%m-%d %H:%M:%S")
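
# Usage sketch (illustrative only, not part of the original module); like get_tz_offset,
# this needs a live ClickHouse connection to resolve the server timezone:
#
#   format_datetime(datetime.date(2020, 1, 1))                # '2020-01-01 00:00:00' shifted to server time
#   format_datetime(datetime.date(2020, 1, 1), day_end=True)  # end of that day instead of its start
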
def module_exists(module_name): # type: (str) -> bool
"""
    Checks if module exists
:param module_name: Dot-separated module name
:return: Boolean
"""
# Python 3.4+
spam_spec = find_spec(module_name)
return spam_spec is not None
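
# Usage sketch (illustrative only, not part of the original module):
#
#   module_exists('os')                  # True
#   module_exists('surely_missing_mod')  # False
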
def lazy_class_import(obj): # type: (Union[str, Any]) -> Any
"""
    If a string is given, imports the object from the given module path.
    Otherwise the object is returned as is
:param obj: A string class path or object to return
:return: Imported object
"""
if isinstance(obj, six.string_types):
module_name, obj_name = obj.rsplit('.', 1)
module = import_module(module_name)
try:
return getattr(module, obj_name)
except AttributeError:
raise ImportError('Invalid import path `%s`' % obj)
else:
return obj
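
# Usage sketch (illustrative only, not part of the original module):
#
#   lazy_class_import('datetime.timedelta')  # returns the datetime.timedelta class
#   lazy_class_import(int)                   # non-string objects are returned unchanged
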
def get_subclasses(cls, recursive=False): # type: (T, bool) -> Set[T]
"""
Gets all subclasses of given class
    Attention!!! Classes are found only if they have been imported before this function is called
    :param cls: Class to get subclasses of
    :param recursive: If flag is set, subclasses of subclasses are returned too, and so on
    :return: A set of subclasses
"""
subclasses = set(cls.__subclasses__())
if recursive:
for subcls in subclasses.copy():
subclasses.update(get_subclasses(subcls, recursive=True))
return subclasses
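
# Usage sketch (illustrative only, not part of the original module); Base, Child and
# GrandChild are hypothetical classes:
#
#   class Base: pass
#   class Child(Base): pass
#   class GrandChild(Child): pass
#
#   get_subclasses(Base)                  # {Child}
#   get_subclasses(Base, recursive=True)  # {Child, GrandChild}
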
def model_to_dict(instance, fields=None, exclude_fields=None):
# type: (DjangoModel, Optional[Iterable[str]], Optional[Iterable[str]]) -> Dict[str, Any]
"""
Standard model_to_dict ignores some fields if they have invalid naming
:param instance: Object to convert to dictionary
:param fields: Field list to extract from instance
    :param exclude_fields: Field list to exclude from extraction
:return: Serialized dictionary
"""
data = {}
opts = instance._meta
fields = fields or {f.name for f in chain(opts.concrete_fields, opts.private_fields, opts.many_to_many)}
for name in set(fields) - set(exclude_fields or set()):
val = getattr(instance, name, None)
if val is not None:
data[name] = val
return data
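
# Usage sketch (illustrative only, not part of the original module); User is a
# hypothetical Django model:
#
#   user = User.objects.get(pk=1)
#   model_to_dict(user, fields=['id', 'username'])    # {'id': 1, 'username': 'alice'}
#   model_to_dict(user, exclude_fields=['password'])  # all non-None fields except 'password'
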
def check_pid(pid):
"""
    Check for the existence of a unix pid.
"""
try:
os.kill(pid, 0)
except OSError:
return False
else:
return True
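
# Usage sketch (illustrative only, not part of the original module):
#
#   check_pid(os.getpid())  # True - the current process certainly exists
#   check_pid(10 ** 8)      # False on most systems - unlikely to be a live pid
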
def int_ranges(items: Iterable[int]) -> Iterator[Tuple[int, int]]:
"""
    Finds continuous intervals in an integer iterable.
:param items: Items to search in
:return: Iterator over Tuple[start, end]
"""
interval_start = None
prev_item = None
for item in sorted(items):
if prev_item is None:
interval_start = prev_item = item
elif prev_item + 1 == item:
prev_item = item
else:
interval = interval_start, prev_item
interval_start = prev_item = item
yield interval
    if interval_start is None:
        # Empty input: just end the generator.
        # Raising StopIteration here would turn into a RuntimeError under PEP 479 (Python 3.7+)
        return
    else:
        yield interval_start, prev_item
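
# Usage sketch (illustrative only, not part of the original module):
#
#   list(int_ranges([1, 2, 3, 5, 6, 10]))  # [(1, 3), (5, 6), (10, 10)]
#   list(int_ranges([]))                   # []
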
class ExceptionThread(Thread):
"""
    Thread subclass which catches exceptions raised in the thread and re-raises them in the joining thread on join()
"""
def __init__(self, *args, **kwargs):
super(ExceptionThread, self).__init__(*args, **kwargs)
self.exc = None
def run(self):
try:
return super(ExceptionThread, self).run()
except Exception as e:
self.exc = e
def join(self, timeout=None):
super(ExceptionThread, self).join(timeout=timeout)
if self.exc:
raise self.exc
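
# Usage sketch (illustrative only, not part of the original module):
#
#   def _fail():
#       raise ValueError('boom')
#
#   t = ExceptionThread(target=_fail)
#   t.start()
#   t.join()  # re-raises the ValueError in the joining thread
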
def exec_in_parallel(func: Callable, args_queue: Queue, threads_count: Optional[int] = None) -> List[Any]:
"""
Executes func in multiple threads in parallel
    The function is expected to be thread safe. If it needs any locks, func must provide them itself.
:param func: Function to execute in thread
:param args_queue: A queue with arguments for separate function call. Each element is tuple of (args, kwargs)
    :param threads_count: Maximum number of parallel threads to run
    :return: A list of results. Order of results is not guaranteed. Element type depends on func return type.
"""
results = []
results_lock = Lock()
    # If threads_count is not given, execute all tasks in parallel.
    # If the queue has fewer elements than threads_count, use the queue size.
threads_count = min(args_queue.qsize(), threads_count) if threads_count else args_queue.qsize()
def _worker():
"""
Thread worker, gets next arguments from queue and processes them.
Results are put into results array using thread safe lock
:return: None
"""
finished = False
while not finished:
try:
# Get arguments
args, kwargs = args_queue.get_nowait()
# Execute function
local_res = func(*args, **kwargs)
# Write result in lock
with results_lock:
results.append(local_res)
args_queue.task_done()
except Empty:
# No data in queue, finish worker thread
finished = True
# Run threads
threads = []
for index in range(threads_count):
t = ExceptionThread(target=_worker)
threads.append(t)
t.start()
# Wait for threads to finish
for t in threads:
t.join()
return results
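
# Usage sketch (illustrative only, not part of the original module); each queue element
# is an (args, kwargs) tuple, as the docstring describes:
#
#   q = Queue()
#   for i in range(10):
#       q.put(((i,), {}))
#   exec_in_parallel(lambda x: x * x, q, threads_count=4)  # squares of 0..9, order not guaranteed
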
def exec_multi_arg_func(func: Callable, split_args: Iterable[Any], *args, threads_count: Optional[int] = None,
**kwargs) -> List[Any]:
"""
    Executes a function in parallel threads. Each thread calls func with one of split_args as its first argument.
    The remaining positional and keyword arguments passed to func are args and kwargs.
    If split_args has zero or one element, no separate threads are started; a single element is processed in the main thread.
:param func: Function to execute. Must accept split_arg as first parameter
:param split_args: A list of arguments to split threads by
:param threads_count: Maximum number of threads to run in parallel
    :return: A list of execution results. Order of results is not guaranteed.
"""
split_args = list(split_args)
if len(split_args) == 0:
return []
elif len(split_args) == 1:
return [func(split_args[0], *args, **kwargs)]
else:
q = Queue()
for s in split_args:
q.put(([s] + list(args), kwargs))
return exec_in_parallel(func, q, threads_count=threads_count)
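
# Usage sketch (illustrative only, not part of the original module):
#
#   exec_multi_arg_func(pow, [2, 3, 4], 2, threads_count=2)
#   # roughly [pow(2, 2), pow(3, 2), pow(4, 2)] == [4, 9, 16], result order not guaranteed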