From 59564f8c70f614fba2ba900eb0ada8a784db8248 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sun, 10 Sep 2017 17:17:04 +0300 Subject: [PATCH] Add `distinct` method to querysets --- CHANGELOG.md | 1 + docs/class_reference.md | 30 +++++++++++++++++++++-------- docs/querysets.md | 10 ++++++++++ docs/toc.md | 1 + scripts/generate_ref.py | 5 ++++- src/infi/clickhouse_orm/query.py | 33 +++++++++++++++++++++++++------- tests/test_querysets.py | 16 +++++++++++++++- 7 files changed, 79 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bfa0fc..8143834 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ Change Log Unreleased ---------- +- Add `distinct` method to querysets - Add `AlterTableWithBuffer` migration operation v0.9.6 diff --git a/docs/class_reference.md b/docs/class_reference.md index 7e4bc74..f5ef191 100644 --- a/docs/class_reference.md +++ b/docs/class_reference.md @@ -7,7 +7,7 @@ infi.clickhouse_orm.database ### Database -Database instances connect to a specific ClickHouse database for running queries, +Database instances connect to a specific ClickHouse database for running queries, inserting data and other operations. #### Database(db_name, db_url="http://localhost:8123/", username=None, password=None, readonly=False, autocreate=True) @@ -71,7 +71,7 @@ Insert records into the database. Executes schema migrations. -- `migrations_package_name` - fully qualified name of the Python package +- `migrations_package_name` - fully qualified name of the Python package containing the migrations. - `up_to` - number of the last migration to apply. @@ -89,7 +89,7 @@ Selects records and returns a single page of model instances. - `conditions`: optional SQL conditions (contents of the WHERE clause). - `settings`: query settings to send as HTTP GET parameters -The result is a namedtuple containing `objects` (list), `number_of_objects`, +The result is a namedtuple containing `objects` (list), `number_of_objects`, `pages_total`, `number` (of the current page), and `page_size`. @@ -128,7 +128,7 @@ infi.clickhouse_orm.models A base class for ORM models. Each model class represent a ClickHouse table. For example: - + class CPUStats(Model): timestamp = DateTimeField() cpu_id = UInt16Field() @@ -172,7 +172,7 @@ If omitted, it is assumed to be the names of all fields in the model, in order o #### get_database() -Gets the `Database` that this model instance belongs to. +Gets the `Database` that this model instance belongs to. Returns `None` unless the instance was read from the database or written to it. @@ -191,7 +191,7 @@ Returns a `QuerySet` for selecting instances of this model class. #### set_database(db) -Sets the `Database` that this model instance belongs to. +Sets the `Database` that this model instance belongs to. This is done automatically when the instance is read from the database or written to it. @@ -261,7 +261,7 @@ If omitted, it is assumed to be the names of all fields in the model, in order o #### get_database() -Gets the `Database` that this model instance belongs to. +Gets the `Database` that this model instance belongs to. Returns `None` unless the instance was read from the database or written to it. @@ -280,7 +280,7 @@ Returns a `QuerySet` for selecting instances of this model class. #### set_database(db) -Sets the `Database` that this model instance belongs to. +Sets the `Database` that this model instance belongs to. This is done automatically when the instance is read from the database or written to it. @@ -585,6 +585,13 @@ Returns the contents of the query's `WHERE` clause as a string. Returns the number of matching model instances. +#### distinct() + + +Adds a DISTINCT clause to the query, meaning that any duplicate rows +in the results will be omitted. + + #### exclude(**kwargs) @@ -678,6 +685,13 @@ Returns the contents of the query's `WHERE` clause as a string. Returns the number of rows after aggregation. +#### distinct() + + +Adds a DISTINCT clause to the query, meaning that any duplicate rows +in the results will be omitted. + + #### exclude(**kwargs) diff --git a/docs/querysets.md b/docs/querysets.md index 2bbefd9..d27c836 100644 --- a/docs/querysets.md +++ b/docs/querysets.md @@ -99,6 +99,16 @@ When some of the model fields aren't needed, it is more efficient to omit them f qs = Person.objects_in(database).only('first_name', 'birthday') +Distinct +-------- + +Adds a DISTINCT clause to the query, meaning that any duplicate rows in the results will be omitted. + + >>> Person.objects_in(database).only('first_name').count() + 100 + >>> Person.objects_in(database).only('first_name').distinct().count() + 94 + Slicing ------- diff --git a/docs/toc.md b/docs/toc.md index 0f83389..f994141 100644 --- a/docs/toc.md +++ b/docs/toc.md @@ -20,6 +20,7 @@ * [Counting and Checking Existence](querysets.md#counting-and-checking-existence) * [Ordering](querysets.md#ordering) * [Omitting Fields](querysets.md#omitting-fields) + * [Distinct](querysets.md#distinct) * [Slicing](querysets.md#slicing) * [Pagination](querysets.md#pagination) * [Aggregation](querysets.md#aggregation) diff --git a/scripts/generate_ref.py b/scripts/generate_ref.py index c35e881..d2863fd 100644 --- a/scripts/generate_ref.py +++ b/scripts/generate_ref.py @@ -51,7 +51,10 @@ def get_method_sig(method): for arg in argspec.args: default_arg = _get_default_arg(argspec.args, argspec.defaults, arg_index) if default_arg.has_default: - args.append("%s=%s" % (arg, default_arg.default_value)) + val = default_arg.default_value + if isinstance(val, basestring): + val = '"' + val + '"' + args.append("%s=%s" % (arg, val)) else: args.append(arg) arg_index += 1 diff --git a/src/infi/clickhouse_orm/query.py b/src/infi/clickhouse_orm/query.py index c1c1dd0..0bf764a 100644 --- a/src/infi/clickhouse_orm/query.py +++ b/src/infi/clickhouse_orm/query.py @@ -187,6 +187,7 @@ class QuerySet(object): self._q = [] self._fields = [] self._limits = None + self._distinct = False def __iter__(self): """ @@ -228,14 +229,15 @@ class QuerySet(object): """ Returns the whole query as a SQL string. """ + distinct = 'DISTINCT ' if self._distinct else '' fields = '*' if self._fields: fields = comma_join('`%s`' % field for field in self._fields) ordering = '\nORDER BY ' + self.order_by_as_sql() if self._order_by else '' limit = '\nLIMIT %d, %d' % self._limits if self._limits else '' - params = (fields, self._model_cls.table_name(), + params = (distinct, fields, self._model_cls.table_name(), self.conditions_as_sql(), ordering, limit) - return u'SELECT %s\nFROM `%s`\nWHERE %s%s%s' % params + return u'SELECT %s%s\nFROM `%s`\nWHERE %s%s%s' % params def order_by_as_sql(self): """ @@ -259,6 +261,11 @@ class QuerySet(object): """ Returns the number of matching model instances. """ + if self._distinct: + # Use a subquery, since a simple count won't be accurate + sql = u'SELECT count() FROM (%s)' % self.as_sql() + raw = self._database.raw(sql) + return int(raw) if raw else 0 return self._database.count(self._model_cls, self.conditions_as_sql()) def order_by(self, *field_names): @@ -296,7 +303,7 @@ class QuerySet(object): return qs def paginate(self, page_num=1, page_size=100): - ''' + """ Returns a single page of model instances that match the queryset. Note that `order_by` should be used first, to ensure a correct partitioning of records into pages. @@ -306,7 +313,7 @@ class QuerySet(object): The result is a namedtuple containing `objects` (list), `number_of_objects`, `pages_total`, `number` (of the current page), and `page_size`. - ''' + """ from .database import Page count = self.count() pages_total = int(ceil(count / float(page_size))) @@ -323,8 +330,17 @@ class QuerySet(object): page_size=page_size ) + def distinct(self): + """ + Adds a DISTINCT clause to the query, meaning that any duplicate rows + in the results will be omitted. + """ + qs = copy(self) + qs._distinct = True + return qs + def aggregate(self, *args, **kwargs): - ''' + """ Returns an `AggregateQuerySet` over this query, with `args` serving as grouping fields and `kwargs` serving as calculated fields. At least one calculated field is required. For example: @@ -337,7 +353,7 @@ class QuerySet(object): WHERE data > '2017-08-01' GROUP BY event_type ``` - ''' + """ return AggregateQuerySet(self, args, kwargs) @@ -368,6 +384,7 @@ class AggregateQuerySet(QuerySet): self._order_by = list(base_qs._order_by) self._q = list(base_qs._q) self._limits = base_qs._limits + self._distinct = base_qs._distinct def group_by(self, *args): """ @@ -398,15 +415,17 @@ class AggregateQuerySet(QuerySet): """ Returns the whole query as a SQL string. """ + distinct = 'DISTINCT ' if self._distinct else '' grouping = comma_join('`%s`' % field for field in self._grouping_fields) fields = comma_join(list(self._fields) + ['%s AS %s' % (v, k) for k, v in self._calculated_fields.items()]) params = dict( + distinct=distinct, grouping=grouping or "''", fields=fields, table=self._model_cls.table_name(), conds=self.conditions_as_sql() ) - sql = u'SELECT %(fields)s\nFROM `%(table)s`\nWHERE %(conds)s\nGROUP BY %(grouping)s' % params + sql = u'SELECT %(distinct)s%(fields)s\nFROM `%(table)s`\nWHERE %(conds)s\nGROUP BY %(grouping)s' % params if self._order_by: sql += '\nORDER BY ' + self.order_by_as_sql() if self._limits: diff --git a/tests/test_querysets.py b/tests/test_querysets.py index ad834bb..cbbc65d 100644 --- a/tests/test_querysets.py +++ b/tests/test_querysets.py @@ -21,8 +21,11 @@ class QuerySetTestCase(TestCaseWithData): def _test_qs(self, qs, expected_count): logging.info(qs.as_sql()) + count = 0 for instance in qs: - logging.info('\t%s' % instance.to_dict()) + count += 1 + logging.info('\t[%d]\t%s' % (count, instance.to_dict())) + self.assertEquals(count, expected_count) self.assertEquals(qs.count(), expected_count) def test_no_filtering(self): @@ -202,6 +205,11 @@ class QuerySetTestCase(TestCaseWithData): page = qs.paginate(1, 100) self.assertEquals(page.number_of_objects, 10) + def test_distinct(self): + qs = Person.objects_in(self.database).distinct() + self._test_qs(qs, 100) + self._test_qs(qs.only('first_name'), 94) + class AggregateTestCase(TestCaseWithData): @@ -310,6 +318,12 @@ class AggregateTestCase(TestCaseWithData): qs = qs.filter(weekday=1) self.assertEquals(qs.count(), 1) + def test_aggregate_with_distinct(self): + # In this case distinct has no effect + qs = Person.objects_in(self.database).aggregate(average_height='avg(height)').distinct() + print(qs.as_sql()) + self.assertEquals(qs.count(), 1) + Color = Enum('Color', u'red blue green yellow brown white black')