infi.clickhouse_orm/src/infi/clickhouse_orm/query.py

from __future__ import unicode_literals

import pytz
from copy import copy, deepcopy
from math import ceil
from datetime import date, datetime
from .utils import comma_join, string_or_func, arg_to_sql


# TODO
# - check that field names are valid

class Operator(object):
    """
    Base class for filtering operators.
    """

    def to_sql(self, model_cls, field_name, value):
        """
        Subclasses should implement this method. It returns an SQL string
        that applies this operator on the given field and value.
        """
        raise NotImplementedError   # pragma: no cover

    def _value_to_sql(self, field, value, quote=True):
        from infi.clickhouse_orm.funcs import F
        if isinstance(value, F):
            return value.to_sql()
        return field.to_db_string(field.to_python(value, pytz.utc), quote)


class SimpleOperator(Operator):
    """
    A simple binary operator such as a=b, a<b, a>b etc.
    """

    def __init__(self, sql_operator, sql_for_null=None):
        self._sql_operator = sql_operator
        self._sql_for_null = sql_for_null

    def to_sql(self, model_cls, field_name, value):
        field = getattr(model_cls, field_name)
        value = self._value_to_sql(field, value)
        if value == '\\N' and self._sql_for_null is not None:
            return ' '.join([field_name, self._sql_for_null])
        return ' '.join([field_name, self._sql_operator, value])


class InOperator(Operator):
    """
    An operator that implements IN.
    Accepts 3 different types of values:
    - a list or tuple of simple values
    - a string (used verbatim as the contents of the parenthesis)
    - a queryset (subquery)
    """

    def to_sql(self, model_cls, field_name, value):
        field = getattr(model_cls, field_name)
        if isinstance(value, QuerySet):
            value = value.as_sql()
        elif isinstance(value, str):
            pass
        else:
            value = comma_join([self._value_to_sql(field, v) for v in value])
        return '%s IN (%s)' % (field_name, value)


class LikeOperator(Operator):
    """
    A LIKE operator that matches the field to a given pattern. Can be
    case sensitive or insensitive.
    """

    def __init__(self, pattern, case_sensitive=True):
        self._pattern = pattern
        self._case_sensitive = case_sensitive

    def to_sql(self, model_cls, field_name, value):
        field = getattr(model_cls, field_name)
        value = self._value_to_sql(field, value, quote=False)
        value = value.replace('\\', '\\\\').replace('%', '\\\\%').replace('_', '\\\\_')
        pattern = self._pattern.format(value)
        if self._case_sensitive:
            return '%s LIKE \'%s\'' % (field_name, pattern)
        else:
            return 'lowerUTF8(%s) LIKE lowerUTF8(\'%s\')' % (field_name, pattern)


class IExactOperator(Operator):
    """
    An operator for case insensitive string comparison.
    """

    def to_sql(self, model_cls, field_name, value):
        field = getattr(model_cls, field_name)
        value = self._value_to_sql(field, value)
        return 'lowerUTF8(%s) = lowerUTF8(%s)' % (field_name, value)


class NotOperator(Operator):
    """
    A wrapper around another operator, which negates it.
    """

    def __init__(self, base_operator):
        self._base_operator = base_operator

    def to_sql(self, model_cls, field_name, value):
        # Negate the base operator
        return 'NOT (%s)' % self._base_operator.to_sql(model_cls, field_name, value)


class BetweenOperator(Operator):
    """
    An operator that implements BETWEEN.
    Accepts list or tuple of two elements and generates sql condition:
    - 'BETWEEN value[0] AND value[1]' if value[0] and value[1] are not None and not empty
    Then imitations of BETWEEN, where one of two limits is missing
    - '>= value[0]' if value[1] is None or empty
    - '<= value[1]' if value[0] is None or empty
    """

    def to_sql(self, model_cls, field_name, value):
        field = getattr(model_cls, field_name)
        value0 = self._value_to_sql(field, value[0]) if value[0] is not None or len(str(value[0])) > 0 else None
        value1 = self._value_to_sql(field, value[1]) if value[1] is not None or len(str(value[1])) > 0 else None
        if value0 and value1:
            return '%s BETWEEN %s AND %s' % (field_name, value0, value1)
        if value0 and not value1:
            return ' '.join([field_name, '>=', value0])
        if value1 and not value0:
            return ' '.join([field_name, '<=', value1])

# Define the set of builtin operators

_operators = {}

def register_operator(name, sql):
    _operators[name] = sql

register_operator('eq',          SimpleOperator('=', 'IS NULL'))
register_operator('ne',          SimpleOperator('!=', 'IS NOT NULL'))
register_operator('gt',          SimpleOperator('>'))
register_operator('gte',         SimpleOperator('>='))
register_operator('lt',          SimpleOperator('<'))
register_operator('lte',         SimpleOperator('<='))
register_operator('between',     BetweenOperator())
register_operator('in',          InOperator())
register_operator('not_in',      NotOperator(InOperator()))
register_operator('contains',    LikeOperator('%{}%'))
register_operator('startswith',  LikeOperator('{}%'))
register_operator('endswith',    LikeOperator('%{}'))
register_operator('icontains',   LikeOperator('%{}%', False))
register_operator('istartswith', LikeOperator('{}%', False))
register_operator('iendswith',   LikeOperator('%{}', False))
register_operator('iexact',      IExactOperator())


class Cond(object):
    """
    An abstract object for storing a single query condition Field + Operator + Value.
    """

    def to_sql(self, model_cls):
        raise NotImplementedError


class FieldCond(Cond):
    """
    A single query condition made up of Field + Operator + Value.
    """
    def __init__(self, field_name, operator, value):
        self._field_name = field_name
        self._operator = _operators.get(operator)
        if self._operator is None:
            # The field name contains __ like my__field
            self._field_name = field_name + '__' + operator
            self._operator = _operators['eq']
        self._value = value

    def to_sql(self, model_cls):
        return self._operator.to_sql(model_cls, self._field_name, self._value)

    def __deepcopy__(self, memodict={}):
        res = copy(self)
        res._value = deepcopy(self._value)
        return res


class Q(object):

    AND_MODE = 'AND'
    OR_MODE = 'OR'

    def __init__(self, *filter_funcs, **filter_fields):
        self._conds = list(filter_funcs) + [self._build_cond(k, v) for k, v in filter_fields.items()]
        self._children = []
        self._negate = False
        self._mode = self.AND_MODE

    @property
    def is_empty(self):
        """
        Checks if there are any conditions in Q object
        Returns: Boolean
        """
        return not bool(self._conds or self._children)

    @classmethod
    def _construct_from(cls, l_child, r_child, mode):
        if mode == l_child._mode:
            q = deepcopy(l_child)
            q._children.append(deepcopy(r_child))
        elif mode == r_child._mode:
            q = deepcopy(r_child)
            q._children.append(deepcopy(l_child))
        else:
            # Different modes
            q = Q()
            q._children = [l_child, r_child]
            q._mode = mode  # AND/OR

        return q

    def _build_cond(self, key, value):
        if '__' in key:
            field_name, operator = key.rsplit('__', 1)
        else:
            field_name, operator = key, 'eq'
        return FieldCond(field_name, operator, value)

    def to_sql(self, model_cls):
        condition_sql = []

        if self._conds:
            condition_sql.extend([cond.to_sql(model_cls) for cond in self._conds])

        if self._children:
            condition_sql.extend([child.to_sql(model_cls) for child in self._children if child])

        if not condition_sql:
            # Empty Q() object returns everything
            sql = '1'
        elif len(condition_sql) == 1:
            # Skip not needed brackets over single condition
            sql = condition_sql[0]
        else:
            # Each condition must be enclosed in brackets, or order of operations may be wrong
            sql = '(%s)' % ') {} ('.format(self._mode).join(condition_sql)

        if self._negate:
            sql = 'NOT (%s)' % sql

        return sql

    def __or__(self, other):
        return Q._construct_from(self, other, self.OR_MODE)

    def __and__(self, other):
        return Q._construct_from(self, other, self.AND_MODE)

    def __invert__(self):
        q = copy(self)
        q._negate = True
        return q

    def __bool__(self):
        return not self.is_empty

    def __deepcopy__(self, memodict={}):
        q = Q()
        q._conds = [deepcopy(cond) for cond in self._conds]
        q._negate = self._negate
        q._mode = self._mode

        if self._children:
            q._children = [deepcopy(child) for child in self._children]

        return q


class QuerySet(object):
    """
    A queryset is an object that represents a database query using a specific `Model`.
    It is lazy, meaning that it does not hit the database until you iterate over its
    matching rows (model instances).
    """

    def __init__(self, model_cls, database):
        """
        Initializer. It is possible to create a queryset like this, but the standard
        way is to use `MyModel.objects_in(database)`.
        """
        self._model_cls = model_cls
        self._database = database
        self._order_by = []
        self._where_q = Q()
        self._prewhere_q = Q()
        self._grouping_fields = []
        self._grouping_with_totals = False
        self._fields = model_cls.fields().keys()
        self._limits = None
        self._limit_by = None
        self._limit_by_fields = None
        self._distinct = False
        self._final = False

    def __iter__(self):
        """
        Iterates over the model instances matching this queryset
        """
        return self._database.select(self.as_sql(), self._model_cls)

    def __bool__(self):
        """
        Returns true if this queryset matches any rows.
        """
        return bool(self.count())

    def __nonzero__(self):      # Python 2 compatibility
        return type(self).__bool__(self)

    def __str__(self):
        return self.as_sql()

    def __getitem__(self, s):
        if isinstance(s, int):
            # Single index
            assert s >= 0, 'negative indexes are not supported'
            qs = copy(self)
            qs._limits = (s, 1)
            return next(iter(qs))
        else:
            # Slice
            assert s.step in (None, 1), 'step is not supported in slices'
            start = s.start or 0
            stop = s.stop or 2**63 - 1
            assert start >= 0 and stop >= 0, 'negative indexes are not supported'
            assert start <= stop, 'start of slice cannot be smaller than its end'
            qs = copy(self)
            qs._limits = (start, stop - start)
            return qs

    def limit_by(self, offset_limit, *fields_or_expr):
        """
        Adds a LIMIT BY clause to the query.
        - `offset_limit`: either an integer specifying the limit, or a tuple of integers (offset, limit).
        - `fields_or_expr`: the field names or expressions to use in the clause.
        """
        if isinstance(offset_limit, int):
            # Single limit
            offset_limit = (0, offset_limit)
        offset = offset_limit[0]
        limit = offset_limit[1]
        assert offset >= 0 and limit >= 0, 'negative limits are not supported'
        qs = copy(self)
        qs._limit_by = (offset, limit)
        qs._limit_by_fields = fields_or_expr
        return qs

    def select_fields_as_sql(self):
        """
        Returns the selected fields or expressions as a SQL string.
        """
        fields = '*'
        if self._fields:
            fields = comma_join('`%s`' % field for field in self._fields)
        return fields

    def as_sql(self):
        """
        Returns the whole query as a SQL string.
        """
        distinct = 'DISTINCT ' if self._distinct else ''
        final = ' FINAL' if self._final else ''
        table_name = '`%s`' % self._model_cls.table_name()
        if self._model_cls.is_system_model():
            table_name = '`system`.' + table_name
        params = (distinct, self.select_fields_as_sql(), table_name, final)
        sql = u'SELECT %s%s\nFROM %s%s' % params

        if self._prewhere_q and not self._prewhere_q.is_empty:
            sql += '\nPREWHERE ' + self.conditions_as_sql(prewhere=True)

        if self._where_q and not self._where_q.is_empty:
            sql += '\nWHERE ' + self.conditions_as_sql(prewhere=False)

        if self._grouping_fields:
            sql += '\nGROUP BY %s' % comma_join('`%s`' % field for field in self._grouping_fields)

            if self._grouping_with_totals:
                sql += ' WITH TOTALS'

        if self._order_by:
            sql += '\nORDER BY ' + self.order_by_as_sql()

        if self._limit_by:
            sql += '\nLIMIT %d, %d' % self._limit_by
            sql += ' BY %s' % comma_join(string_or_func(field) for field in self._limit_by_fields)

        if self._limits:
            sql += '\nLIMIT %d, %d' % self._limits

        return sql

    def order_by_as_sql(self):
        """
        Returns the contents of the query's `ORDER BY` clause as a string.
        """
        return comma_join([
            '%s DESC' % field[1:] if isinstance(field, str) and field[0] == '-' else str(field)
            for field in self._order_by
        ])

    def conditions_as_sql(self, prewhere=False):
        """
        Returns the contents of the query's `WHERE` or `PREWHERE` clause as a string.
        """
        q_object = self._prewhere_q if prewhere else self._where_q
        return q_object.to_sql(self._model_cls)

    def count(self):
        """
        Returns the number of matching model instances.
        """
        if self._distinct or self._limits:
            # Use a subquery, since a simple count won't be accurate
            sql = u'SELECT count() FROM (%s)' % self.as_sql()
            raw = self._database.raw(sql)
            return int(raw) if raw else 0

        # Simple case
        conditions = (self._where_q & self._prewhere_q).to_sql(self._model_cls)
        return self._database.count(self._model_cls, conditions)

    def order_by(self, *field_names):
        """
        Returns a copy of this queryset with the ordering changed.
        """
        qs = copy(self)
        qs._order_by = field_names
        return qs

    def only(self, *field_names):
        """
        Returns a copy of this queryset limited to the specified field names.
        Useful when there are large fields that are not needed,
        or for creating a subquery to use with an IN operator.
        """
        qs = copy(self)
        qs._fields = field_names
        return qs

    def _filter_or_exclude(self, *q, **kwargs):
        from .funcs import F

        inverse = kwargs.pop('_inverse', False)
        prewhere = kwargs.pop('prewhere', False)

        qs = copy(self)

        condition = Q()
        for arg in q:
            if isinstance(arg, Q):
                condition &= arg
            elif isinstance(arg, F):
                condition &= Q(arg)
            else:
                raise TypeError('Invalid argument "%r" to queryset filter' % arg)

        if kwargs:
            condition &= Q(**kwargs)

        if inverse:
            condition = ~condition

        condition = copy(self._prewhere_q if prewhere else self._where_q) & condition
        if prewhere:
            qs._prewhere_q = condition
        else:
            qs._where_q = condition

        return qs

    def filter(self, *q, **kwargs):
        """
        Returns a copy of this queryset that includes only rows matching the conditions.
        Pass `prewhere=True` to apply the conditions as PREWHERE instead of WHERE.
        """
        return self._filter_or_exclude(*q, **kwargs)

    def exclude(self, *q, **kwargs):
        """
        Returns a copy of this queryset that excludes all rows matching the conditions.
        Pass `prewhere=True` to apply the conditions as PREWHERE instead of WHERE.
        """
        return self._filter_or_exclude(*q, _inverse=True, **kwargs)

    def paginate(self, page_num=1, page_size=100):
        """
        Returns a single page of model instances that match the queryset.
        Note that `order_by` should be used first, to ensure a correct
        partitioning of records into pages.

        - `page_num`: the page number (1-based), or -1 to get the last page.
        - `page_size`: number of records to return per page.

        The result is a namedtuple containing `objects` (list), `number_of_objects`,
        `pages_total`, `number` (of the current page), and `page_size`.
        """
        from .database import Page
        count = self.count()
        pages_total = int(ceil(count / float(page_size)))
        if page_num == -1:
            page_num = pages_total
        elif page_num < 1:
            raise ValueError('Invalid page number: %d' % page_num)
        offset = (page_num - 1) * page_size
        return Page(
            objects=list(self[offset : offset + page_size]),
            number_of_objects=count,
            pages_total=pages_total,
            number=page_num,
            page_size=page_size
        )

    def distinct(self):
        """
        Adds a DISTINCT clause to the query, meaning that any duplicate rows
        in the results will be omitted.
        """
        qs = copy(self)
        qs._distinct = True
        return qs

    def final(self):
        """
        Adds a FINAL modifier to table, meaning data will be collapsed to final version.
        Can be used with the `CollapsingMergeTree` and `ReplacingMergeTree` engines only.
        """
        from .engines import CollapsingMergeTree, ReplacingMergeTree
        if not isinstance(self._model_cls.engine, (CollapsingMergeTree, ReplacingMergeTree)):
            raise TypeError('final() method can be used only with the CollapsingMergeTree and ReplacingMergeTree engines')

        qs = copy(self)
        qs._final = True
        return qs

    def delete(self):
        """
        Deletes all records matched by this queryset's conditions.
        Note that ClickHouse performs deletions in the background, so they are not immediate.
        """
        self._verify_mutation_allowed()
        conditions = (self._where_q & self._prewhere_q).to_sql(self._model_cls)
        sql = 'ALTER TABLE $db.`%s` DELETE WHERE %s' % (self._model_cls.table_name(), conditions)
        self._database.raw(sql)
        return self

    def update(self, **kwargs):
        """
        Updates all records matched by this queryset's conditions.
        Keyword arguments specify the field names and expressions to use for the update.
        Note that ClickHouse performs updates in the background, so they are not immediate.
        """
        assert kwargs, 'No fields specified for update'
        self._verify_mutation_allowed()
        fields = comma_join('`%s` = %s' % (name, arg_to_sql(expr)) for name, expr in kwargs.items())
        conditions = (self._where_q & self._prewhere_q).to_sql(self._model_cls)
        sql = 'ALTER TABLE $db.`%s` UPDATE %s WHERE %s' % (self._model_cls.table_name(), fields, conditions)
        self._database.raw(sql)
        return self

    def _verify_mutation_allowed(self):
        '''
        Checks that the queryset's state allows mutations. Raises an AssertionError if not.
        '''
        assert not self._limits, 'Mutations are not allowed after slicing the queryset'
        assert not self._limit_by, 'Mutations are not allowed after calling limit_by(...)'
        assert not self._distinct, 'Mutations are not allowed after calling distinct()'
        assert not self._final, 'Mutations are not allowed after calling final()'

    def aggregate(self, *args, **kwargs):
        """
        Returns an `AggregateQuerySet` over this query, with `args` serving as
        grouping fields and `kwargs` serving as calculated fields. At least one
        calculated field is required. For example:
        ```
            Event.objects_in(database).filter(date__gt='2017-08-01').aggregate('event_type', count='count()')
        ```
        is equivalent to:
        ```
            SELECT event_type, count() AS count FROM event
            WHERE data > '2017-08-01'
            GROUP BY event_type
        ```
        """
        return AggregateQuerySet(self, args, kwargs)


class AggregateQuerySet(QuerySet):
    """
    A queryset used for aggregation.
    """

    def __init__(self, base_qs, grouping_fields, calculated_fields):
        """
        Initializer. Normally you should not call this but rather use `QuerySet.aggregate()`.

        The grouping fields should be a list/tuple of field names from the model. For example:
        ```
            ('event_type', 'event_subtype')
        ```
        The calculated fields should be a mapping from name to a ClickHouse aggregation function. For example:
        ```
            {'weekday': 'toDayOfWeek(event_date)', 'number_of_events': 'count()'}
        ```
        At least one calculated field is required.
        """
        super(AggregateQuerySet, self).__init__(base_qs._model_cls, base_qs._database)
        assert calculated_fields, 'No calculated fields specified for aggregation'
        self._fields = grouping_fields
        self._grouping_fields = grouping_fields
        self._calculated_fields = calculated_fields
        self._order_by = list(base_qs._order_by)
        self._where_q = base_qs._where_q
        self._prewhere_q = base_qs._prewhere_q
        self._limits = base_qs._limits
        self._distinct = base_qs._distinct

    def group_by(self, *args):
        """
        This method lets you specify the grouping fields explicitly. The `args` must
        be names of grouping fields or calculated fields that this queryset was
        created with.
        """
        for name in args:
            assert name in self._fields or name in self._calculated_fields, \
                   'Cannot group by `%s` since it is not included in the query' % name
        qs = copy(self)
        qs._grouping_fields = args
        return qs

    def only(self, *field_names):
        """
        This method is not supported on `AggregateQuerySet`.
        """
        raise NotImplementedError('Cannot use "only" with AggregateQuerySet')

    def aggregate(self, *args, **kwargs):
        """
        This method is not supported on `AggregateQuerySet`.
        """
        raise NotImplementedError('Cannot re-aggregate an AggregateQuerySet')

    def select_fields_as_sql(self):
        """
        Returns the selected fields or expressions as a SQL string.
        """
        return comma_join([str(f) for f in self._fields] + ['%s AS %s' % (v, k) for k, v in self._calculated_fields.items()])

    def __iter__(self):
        return self._database.select(self.as_sql()) # using an ad-hoc model

    def count(self):
        """
        Returns the number of rows after aggregation.
        """
        sql = u'SELECT count() FROM (%s)' % self.as_sql()
        raw = self._database.raw(sql)
        return int(raw) if raw else 0

    def with_totals(self):
        """
        Adds WITH TOTALS modifier ot GROUP BY, making query return extra row
        with aggregate function calculated across all the rows. More information:
        https://clickhouse.tech/docs/en/query_language/select/#with-totals-modifier
        """
        qs = copy(self)
        qs._grouping_with_totals = True
        return qs

    def _verify_mutation_allowed(self):
        raise AssertionError('Cannot mutate an AggregateQuerySet')


# Expose only relevant classes in import *
__all__ = [c.__name__ for c in [Q, QuerySet, AggregateQuerySet]]