Add distinct method to querysets

This commit is contained in:
Itai Shirav 2017-09-10 17:17:04 +03:00
parent 7bbcae574a
commit 59564f8c70
7 changed files with 79 additions and 17 deletions

View File

@ -3,6 +3,7 @@ Change Log
Unreleased Unreleased
---------- ----------
- Add `distinct` method to querysets
- Add `AlterTableWithBuffer` migration operation - Add `AlterTableWithBuffer` migration operation
v0.9.6 v0.9.6

View File

@ -7,7 +7,7 @@ infi.clickhouse_orm.database
### Database ### Database
Database instances connect to a specific ClickHouse database for running queries, Database instances connect to a specific ClickHouse database for running queries,
inserting data and other operations. inserting data and other operations.
#### Database(db_name, db_url="http://localhost:8123/", username=None, password=None, readonly=False, autocreate=True) #### Database(db_name, db_url="http://localhost:8123/", username=None, password=None, readonly=False, autocreate=True)
@ -71,7 +71,7 @@ Insert records into the database.
Executes schema migrations. Executes schema migrations.
- `migrations_package_name` - fully qualified name of the Python package - `migrations_package_name` - fully qualified name of the Python package
containing the migrations. containing the migrations.
- `up_to` - number of the last migration to apply. - `up_to` - number of the last migration to apply.
@ -89,7 +89,7 @@ Selects records and returns a single page of model instances.
- `conditions`: optional SQL conditions (contents of the WHERE clause). - `conditions`: optional SQL conditions (contents of the WHERE clause).
- `settings`: query settings to send as HTTP GET parameters - `settings`: query settings to send as HTTP GET parameters
The result is a namedtuple containing `objects` (list), `number_of_objects`, The result is a namedtuple containing `objects` (list), `number_of_objects`,
`pages_total`, `number` (of the current page), and `page_size`. `pages_total`, `number` (of the current page), and `page_size`.
@ -128,7 +128,7 @@ infi.clickhouse_orm.models
A base class for ORM models. Each model class represents a ClickHouse table. For example: A base class for ORM models. Each model class represents a ClickHouse table. For example:
class CPUStats(Model): class CPUStats(Model):
timestamp = DateTimeField() timestamp = DateTimeField()
cpu_id = UInt16Field() cpu_id = UInt16Field()
@ -172,7 +172,7 @@ If omitted, it is assumed to be the names of all fields in the model, in order o
#### get_database() #### get_database()
Gets the `Database` that this model instance belongs to. Gets the `Database` that this model instance belongs to.
Returns `None` unless the instance was read from the database or written to it. Returns `None` unless the instance was read from the database or written to it.
@ -191,7 +191,7 @@ Returns a `QuerySet` for selecting instances of this model class.
#### set_database(db) #### set_database(db)
Sets the `Database` that this model instance belongs to. Sets the `Database` that this model instance belongs to.
This is done automatically when the instance is read from the database or written to it. This is done automatically when the instance is read from the database or written to it.
@ -261,7 +261,7 @@ If omitted, it is assumed to be the names of all fields in the model, in order o
#### get_database() #### get_database()
Gets the `Database` that this model instance belongs to. Gets the `Database` that this model instance belongs to.
Returns `None` unless the instance was read from the database or written to it. Returns `None` unless the instance was read from the database or written to it.
@ -280,7 +280,7 @@ Returns a `QuerySet` for selecting instances of this model class.
#### set_database(db) #### set_database(db)
Sets the `Database` that this model instance belongs to. Sets the `Database` that this model instance belongs to.
This is done automatically when the instance is read from the database or written to it. This is done automatically when the instance is read from the database or written to it.
@ -585,6 +585,13 @@ Returns the contents of the query's `WHERE` clause as a string.
Returns the number of matching model instances. Returns the number of matching model instances.
#### distinct()
Adds a DISTINCT clause to the query, meaning that any duplicate rows
in the results will be omitted.
#### exclude(**kwargs) #### exclude(**kwargs)
@ -678,6 +685,13 @@ Returns the contents of the query's `WHERE` clause as a string.
Returns the number of rows after aggregation. Returns the number of rows after aggregation.
#### distinct()
Adds a DISTINCT clause to the query, meaning that any duplicate rows
in the results will be omitted.
#### exclude(**kwargs) #### exclude(**kwargs)

View File

@ -99,6 +99,16 @@ When some of the model fields aren't needed, it is more efficient to omit them f
qs = Person.objects_in(database).only('first_name', 'birthday') qs = Person.objects_in(database).only('first_name', 'birthday')
Distinct
--------
Adds a DISTINCT clause to the query, meaning that any duplicate rows in the results will be omitted.
>>> Person.objects_in(database).only('first_name').count()
100
>>> Person.objects_in(database).only('first_name').distinct().count()
94
Slicing Slicing
------- -------

View File

@ -20,6 +20,7 @@
* [Counting and Checking Existence](querysets.md#counting-and-checking-existence) * [Counting and Checking Existence](querysets.md#counting-and-checking-existence)
* [Ordering](querysets.md#ordering) * [Ordering](querysets.md#ordering)
* [Omitting Fields](querysets.md#omitting-fields) * [Omitting Fields](querysets.md#omitting-fields)
* [Distinct](querysets.md#distinct)
* [Slicing](querysets.md#slicing) * [Slicing](querysets.md#slicing)
* [Pagination](querysets.md#pagination) * [Pagination](querysets.md#pagination)
* [Aggregation](querysets.md#aggregation) * [Aggregation](querysets.md#aggregation)

View File

@ -51,7 +51,10 @@ def get_method_sig(method):
for arg in argspec.args: for arg in argspec.args:
default_arg = _get_default_arg(argspec.args, argspec.defaults, arg_index) default_arg = _get_default_arg(argspec.args, argspec.defaults, arg_index)
if default_arg.has_default: if default_arg.has_default:
args.append("%s=%s" % (arg, default_arg.default_value)) val = default_arg.default_value
if isinstance(val, basestring):
val = '"' + val + '"'
args.append("%s=%s" % (arg, val))
else: else:
args.append(arg) args.append(arg)
arg_index += 1 arg_index += 1

View File

@ -187,6 +187,7 @@ class QuerySet(object):
self._q = [] self._q = []
self._fields = [] self._fields = []
self._limits = None self._limits = None
self._distinct = False
def __iter__(self): def __iter__(self):
""" """
@ -228,14 +229,15 @@ class QuerySet(object):
""" """
Returns the whole query as a SQL string. Returns the whole query as a SQL string.
""" """
distinct = 'DISTINCT ' if self._distinct else ''
fields = '*' fields = '*'
if self._fields: if self._fields:
fields = comma_join('`%s`' % field for field in self._fields) fields = comma_join('`%s`' % field for field in self._fields)
ordering = '\nORDER BY ' + self.order_by_as_sql() if self._order_by else '' ordering = '\nORDER BY ' + self.order_by_as_sql() if self._order_by else ''
limit = '\nLIMIT %d, %d' % self._limits if self._limits else '' limit = '\nLIMIT %d, %d' % self._limits if self._limits else ''
params = (fields, self._model_cls.table_name(), params = (distinct, fields, self._model_cls.table_name(),
self.conditions_as_sql(), ordering, limit) self.conditions_as_sql(), ordering, limit)
return u'SELECT %s\nFROM `%s`\nWHERE %s%s%s' % params return u'SELECT %s%s\nFROM `%s`\nWHERE %s%s%s' % params
def order_by_as_sql(self): def order_by_as_sql(self):
""" """
@ -259,6 +261,11 @@ class QuerySet(object):
""" """
Returns the number of matching model instances. Returns the number of matching model instances.
""" """
if self._distinct:
# Use a subquery, since a simple count won't be accurate
sql = u'SELECT count() FROM (%s)' % self.as_sql()
raw = self._database.raw(sql)
return int(raw) if raw else 0
return self._database.count(self._model_cls, self.conditions_as_sql()) return self._database.count(self._model_cls, self.conditions_as_sql())
def order_by(self, *field_names): def order_by(self, *field_names):
@ -296,7 +303,7 @@ class QuerySet(object):
return qs return qs
def paginate(self, page_num=1, page_size=100): def paginate(self, page_num=1, page_size=100):
''' """
Returns a single page of model instances that match the queryset. Returns a single page of model instances that match the queryset.
Note that `order_by` should be used first, to ensure a correct Note that `order_by` should be used first, to ensure a correct
partitioning of records into pages. partitioning of records into pages.
@ -306,7 +313,7 @@ class QuerySet(object):
The result is a namedtuple containing `objects` (list), `number_of_objects`, The result is a namedtuple containing `objects` (list), `number_of_objects`,
`pages_total`, `number` (of the current page), and `page_size`. `pages_total`, `number` (of the current page), and `page_size`.
''' """
from .database import Page from .database import Page
count = self.count() count = self.count()
pages_total = int(ceil(count / float(page_size))) pages_total = int(ceil(count / float(page_size)))
@ -323,8 +330,17 @@ class QuerySet(object):
page_size=page_size page_size=page_size
) )
def distinct(self):
"""
Adds a DISTINCT clause to the query, meaning that any duplicate rows
in the results will be omitted.

The DISTINCT flag is set on a copy made with `copy(self)`, and that
copy is returned, so calls can be chained with other queryset methods.
"""
qs = copy(self)
qs._distinct = True
return qs
def aggregate(self, *args, **kwargs): def aggregate(self, *args, **kwargs):
''' """
Returns an `AggregateQuerySet` over this query, with `args` serving as Returns an `AggregateQuerySet` over this query, with `args` serving as
grouping fields and `kwargs` serving as calculated fields. At least one grouping fields and `kwargs` serving as calculated fields. At least one
calculated field is required. For example: calculated field is required. For example:
@ -337,7 +353,7 @@ class QuerySet(object):
WHERE data > '2017-08-01' WHERE data > '2017-08-01'
GROUP BY event_type GROUP BY event_type
``` ```
''' """
return AggregateQuerySet(self, args, kwargs) return AggregateQuerySet(self, args, kwargs)
@ -368,6 +384,7 @@ class AggregateQuerySet(QuerySet):
self._order_by = list(base_qs._order_by) self._order_by = list(base_qs._order_by)
self._q = list(base_qs._q) self._q = list(base_qs._q)
self._limits = base_qs._limits self._limits = base_qs._limits
self._distinct = base_qs._distinct
def group_by(self, *args): def group_by(self, *args):
""" """
@ -398,15 +415,17 @@ class AggregateQuerySet(QuerySet):
""" """
Returns the whole query as a SQL string. Returns the whole query as a SQL string.
""" """
distinct = 'DISTINCT ' if self._distinct else ''
grouping = comma_join('`%s`' % field for field in self._grouping_fields) grouping = comma_join('`%s`' % field for field in self._grouping_fields)
fields = comma_join(list(self._fields) + ['%s AS %s' % (v, k) for k, v in self._calculated_fields.items()]) fields = comma_join(list(self._fields) + ['%s AS %s' % (v, k) for k, v in self._calculated_fields.items()])
params = dict( params = dict(
distinct=distinct,
grouping=grouping or "''", grouping=grouping or "''",
fields=fields, fields=fields,
table=self._model_cls.table_name(), table=self._model_cls.table_name(),
conds=self.conditions_as_sql() conds=self.conditions_as_sql()
) )
sql = u'SELECT %(fields)s\nFROM `%(table)s`\nWHERE %(conds)s\nGROUP BY %(grouping)s' % params sql = u'SELECT %(distinct)s%(fields)s\nFROM `%(table)s`\nWHERE %(conds)s\nGROUP BY %(grouping)s' % params
if self._order_by: if self._order_by:
sql += '\nORDER BY ' + self.order_by_as_sql() sql += '\nORDER BY ' + self.order_by_as_sql()
if self._limits: if self._limits:

View File

@ -21,8 +21,11 @@ class QuerySetTestCase(TestCaseWithData):
def _test_qs(self, qs, expected_count): def _test_qs(self, qs, expected_count):
logging.info(qs.as_sql()) logging.info(qs.as_sql())
count = 0
for instance in qs: for instance in qs:
logging.info('\t%s' % instance.to_dict()) count += 1
logging.info('\t[%d]\t%s' % (count, instance.to_dict()))
self.assertEquals(count, expected_count)
self.assertEquals(qs.count(), expected_count) self.assertEquals(qs.count(), expected_count)
def test_no_filtering(self): def test_no_filtering(self):
@ -202,6 +205,11 @@ class QuerySetTestCase(TestCaseWithData):
page = qs.paginate(1, 100) page = qs.paginate(1, 100)
self.assertEquals(page.number_of_objects, 10) self.assertEquals(page.number_of_objects, 10)
def test_distinct(self):
# DISTINCT over all fields is expected to keep 100 rows, while
# restricting the selection to `first_name` is expected to leave 94.
qs = Person.objects_in(self.database).distinct()
self._test_qs(qs, 100)
self._test_qs(qs.only('first_name'), 94)
class AggregateTestCase(TestCaseWithData): class AggregateTestCase(TestCaseWithData):
@ -310,6 +318,12 @@ class AggregateTestCase(TestCaseWithData):
qs = qs.filter(weekday=1) qs = qs.filter(weekday=1)
self.assertEquals(qs.count(), 1) self.assertEquals(qs.count(), 1)
def test_aggregate_with_distinct(self):
    """
    Checks that `distinct()` can be chained onto an aggregate queryset
    and that the resulting query still counts a single row.
    """
    # In this case distinct has no effect
    qs = Person.objects_in(self.database).aggregate(average_height='avg(height)').distinct()
    # Log the generated SQL rather than printing it, consistent with the
    # logging already used by this test module.
    logging.info(qs.as_sql())
    self.assertEquals(qs.count(), 1)
Color = Enum('Color', u'red blue green yellow brown white black') Color = Enum('Color', u'red blue green yellow brown white black')