From 841340acd46ce50569d14634e56a9d4726a956c6 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sun, 10 Sep 2017 15:46:55 +0300 Subject: [PATCH 1/6] Add `AlterTableWithBuffer` migration operation --- CHANGELOG.md | 4 ++ docs/schema_migrations.md | 13 ++++++- src/infi/clickhouse_orm/migrations.py | 21 +++++++++- tests/sample_migrations/0010.py | 6 +++ tests/sample_migrations/0011.py | 6 +++ tests/test_migrations.py | 56 ++++++++++++++++++++++++++- 6 files changed, 102 insertions(+), 4 deletions(-) create mode 100644 tests/sample_migrations/0010.py create mode 100644 tests/sample_migrations/0011.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d302dc4..1bfa0fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,10 @@ Change Log ========== +Unreleased +---------- +- Add `AlterTableWithBuffer` migration operation + v0.9.6 ------ - Fix python3 compatibility (TvoroG) diff --git a/docs/schema_migrations.md b/docs/schema_migrations.md index e4647bf..ce56b04 100644 --- a/docs/schema_migrations.md +++ b/docs/schema_migrations.md @@ -34,11 +34,13 @@ The following operations are supported: **CreateTable** -A migration operation that creates a table for a given model class. +A migration operation that creates a table for a given model class. If the table already exists, the operation does nothing. + +In case the model class is a `BufferModel`, the operation first creates the underlying on-disk table, and then creates the buffer table. **DropTable** -A migration operation that drops the table of a given model class. +A migration operation that drops the table of a given model class. If the table does not exist, the operation does nothing. **AlterTable** @@ -50,6 +52,13 @@ A migration operation that compares the table of a given model class to the mode Default values are not altered by this operation. +**AlterTableWithBuffer** + +A compound migration operation for altering a buffer table and its underlying on-disk table. The buffer table is dropped, the on-disk table is altered, and then the buffer table is re-created. This is the procedure recommended in the ClickHouse documentation for handling scenarios in which the underlying table needs to be modified. + +Applying this migration operation to a regular table has the same effect as an `AlterTable` operation. + + Running Migrations ------------------ diff --git a/src/infi/clickhouse_orm/migrations.py b/src/infi/clickhouse_orm/migrations.py index 1a82a5b..a7843a7 100644 --- a/src/infi/clickhouse_orm/migrations.py +++ b/src/infi/clickhouse_orm/migrations.py @@ -82,6 +82,25 @@ class AlterTable(Operation): self._alter_table(database, 'MODIFY COLUMN %s %s' % model_field) +class AlterTableWithBuffer(Operation): + ''' + A migration operation for altering a buffer table and its underlying on-disk table. + The buffer table is dropped, the on-disk table is altered, and then the buffer table + is re-created. + ''' + + def __init__(self, model_class): + self.model_class = model_class + + def apply(self, database): + if issubclass(self.model_class, BufferModel): + DropTable(self.model_class).apply(database) + AlterTable(self.model_class.engine.main_model).apply(database) + CreateTable(self.model_class).apply(database) + else: + AlterTable(self.model_class).apply(database) + + class DropTable(Operation): ''' A migration operation that drops the table of a given model class. @@ -91,7 +110,7 @@ class DropTable(Operation): self.model_class = model_class def apply(self, database): - logger.info(' Drop table %s', self.model_class.__name__) + logger.info(' Drop table %s', self.model_class.table_name()) database.drop_table(self.model_class) diff --git a/tests/sample_migrations/0010.py b/tests/sample_migrations/0010.py new file mode 100644 index 0000000..3892583 --- /dev/null +++ b/tests/sample_migrations/0010.py @@ -0,0 +1,6 @@ +from infi.clickhouse_orm import migrations +from ..test_migrations import * + +operations = [ + migrations.CreateTable(Model4Buffer) +] diff --git a/tests/sample_migrations/0011.py b/tests/sample_migrations/0011.py new file mode 100644 index 0000000..dd9d09e --- /dev/null +++ b/tests/sample_migrations/0011.py @@ -0,0 +1,6 @@ +from infi.clickhouse_orm import migrations +from ..test_migrations import * + +operations = [ + migrations.AlterTableWithBuffer(Model4Buffer_changed) +] diff --git a/tests/test_migrations.py b/tests/test_migrations.py index 3478f9f..7e31c84 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import unittest from infi.clickhouse_orm.database import Database -from infi.clickhouse_orm.models import Model +from infi.clickhouse_orm.models import Model, BufferModel from infi.clickhouse_orm.fields import * from infi.clickhouse_orm.engines import * from infi.clickhouse_orm.migrations import MigrationHistory @@ -61,6 +61,7 @@ class MigrationsTestCase(unittest.TestCase): self.assertTrue(self.tableExists(EnumModel1)) self.assertEquals(self.getTableFields(EnumModel2), [('date', 'Date'), ('f1', "Enum16('dog' = 1, 'cat' = 2, 'horse' = 3, 'pig' = 4)")]) + # Materialized fields and alias fields self.database.migrate('tests.sample_migrations', 8) self.assertTrue(self.tableExists(MaterializedModel)) self.assertEquals(self.getTableFields(MaterializedModel), @@ -69,6 +70,15 @@ class MigrationsTestCase(unittest.TestCase): self.assertTrue(self.tableExists(AliasModel)) self.assertEquals(self.getTableFields(AliasModel), [('date', 'Date'), ('date_alias', "Date")]) + # Buffer models creation and alteration + self.database.migrate('tests.sample_migrations', 10) + self.assertTrue(self.tableExists(Model4)) + self.assertTrue(self.tableExists(Model4Buffer)) + self.assertEquals(self.getTableFields(Model4), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')]) + self.assertEquals(self.getTableFields(Model4Buffer), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')]) + self.database.migrate('tests.sample_migrations', 11) + self.assertEquals(self.getTableFields(Model4), [('date', 'Date'), ('f3', 'DateTime'), ('f2', 'String')]) + self.assertEquals(self.getTableFields(Model4Buffer), [('date', 'Date'), ('f3', 'DateTime'), ('f2', 'String')]) # Several different models with the same table name, to simulate a table that changes over time @@ -159,3 +169,47 @@ class AliasModel(Model): @classmethod def table_name(cls): return 'alias_date' + + +class Model4(Model): + + date = DateField() + f1 = Int32Field() + f2 = StringField() + + engine = MergeTree('date', ('date',)) + + @classmethod + def table_name(cls): + return 'model4' + + +class Model4Buffer(BufferModel, Model4): + + engine = Buffer(Model4) + + @classmethod + def table_name(cls): + return 'model4buffer' + + +class Model4_changed(Model): + + date = DateField() + f3 = DateTimeField() + f2 = StringField() + + engine = MergeTree('date', ('date',)) + + @classmethod + def table_name(cls): + return 'model4' + + +class Model4Buffer_changed(BufferModel, Model4_changed): + + engine = Buffer(Model4_changed) + + @classmethod + def table_name(cls): + return 'model4buffer' From 01f91de54b0cc2f69e8688590dcce31054414e56 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sun, 10 Sep 2017 17:17:04 +0300 Subject: [PATCH 2/6] Add `distinct` method to querysets --- CHANGELOG.md | 1 + docs/class_reference.md | 30 +++++++++++++++++++++-------- docs/querysets.md | 10 ++++++++++ docs/toc.md | 1 + scripts/generate_ref.py | 5 ++++- src/infi/clickhouse_orm/query.py | 33 +++++++++++++++++++++++++------- tests/test_querysets.py | 16 +++++++++++++++- 7 files changed, 79 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bfa0fc..8143834 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ Change Log Unreleased ---------- +- Add `distinct` method to querysets - Add `AlterTableWithBuffer` migration operation v0.9.6 diff --git a/docs/class_reference.md b/docs/class_reference.md index 7e4bc74..f5ef191 100644 --- a/docs/class_reference.md +++ b/docs/class_reference.md @@ -7,7 +7,7 @@ infi.clickhouse_orm.database ### Database -Database instances connect to a specific ClickHouse database for running queries, +Database instances connect to a specific ClickHouse database for running queries, inserting data and other operations. #### Database(db_name, db_url="http://localhost:8123/", username=None, password=None, readonly=False, autocreate=True) @@ -71,7 +71,7 @@ Insert records into the database. Executes schema migrations. -- `migrations_package_name` - fully qualified name of the Python package +- `migrations_package_name` - fully qualified name of the Python package containing the migrations. - `up_to` - number of the last migration to apply. @@ -89,7 +89,7 @@ Selects records and returns a single page of model instances. - `conditions`: optional SQL conditions (contents of the WHERE clause). - `settings`: query settings to send as HTTP GET parameters -The result is a namedtuple containing `objects` (list), `number_of_objects`, +The result is a namedtuple containing `objects` (list), `number_of_objects`, `pages_total`, `number` (of the current page), and `page_size`. @@ -128,7 +128,7 @@ infi.clickhouse_orm.models A base class for ORM models. Each model class represent a ClickHouse table. For example: - + class CPUStats(Model): timestamp = DateTimeField() cpu_id = UInt16Field() @@ -172,7 +172,7 @@ If omitted, it is assumed to be the names of all fields in the model, in order o #### get_database() -Gets the `Database` that this model instance belongs to. +Gets the `Database` that this model instance belongs to. Returns `None` unless the instance was read from the database or written to it. @@ -191,7 +191,7 @@ Returns a `QuerySet` for selecting instances of this model class. #### set_database(db) -Sets the `Database` that this model instance belongs to. +Sets the `Database` that this model instance belongs to. This is done automatically when the instance is read from the database or written to it. @@ -261,7 +261,7 @@ If omitted, it is assumed to be the names of all fields in the model, in order o #### get_database() -Gets the `Database` that this model instance belongs to. +Gets the `Database` that this model instance belongs to. Returns `None` unless the instance was read from the database or written to it. @@ -280,7 +280,7 @@ Returns a `QuerySet` for selecting instances of this model class. #### set_database(db) -Sets the `Database` that this model instance belongs to. +Sets the `Database` that this model instance belongs to. This is done automatically when the instance is read from the database or written to it. @@ -585,6 +585,13 @@ Returns the contents of the query's `WHERE` clause as a string. Returns the number of matching model instances. +#### distinct() + + +Adds a DISTINCT clause to the query, meaning that any duplicate rows +in the results will be omitted. + + #### exclude(**kwargs) @@ -678,6 +685,13 @@ Returns the contents of the query's `WHERE` clause as a string. Returns the number of rows after aggregation. +#### distinct() + + +Adds a DISTINCT clause to the query, meaning that any duplicate rows +in the results will be omitted. + + #### exclude(**kwargs) diff --git a/docs/querysets.md b/docs/querysets.md index 2bbefd9..d27c836 100644 --- a/docs/querysets.md +++ b/docs/querysets.md @@ -99,6 +99,16 @@ When some of the model fields aren't needed, it is more efficient to omit them f qs = Person.objects_in(database).only('first_name', 'birthday') +Distinct +-------- + +Adds a DISTINCT clause to the query, meaning that any duplicate rows in the results will be omitted. + + >>> Person.objects_in(database).only('first_name').count() + 100 + >>> Person.objects_in(database).only('first_name').distinct().count() + 94 + Slicing ------- diff --git a/docs/toc.md b/docs/toc.md index aa5bb3b..99c486d 100644 --- a/docs/toc.md +++ b/docs/toc.md @@ -20,6 +20,7 @@ * [Counting and Checking Existence](querysets.md#counting-and-checking-existence) * [Ordering](querysets.md#ordering) * [Omitting Fields](querysets.md#omitting-fields) + * [Distinct](querysets.md#distinct) * [Slicing](querysets.md#slicing) * [Pagination](querysets.md#pagination) * [Aggregation](querysets.md#aggregation) diff --git a/scripts/generate_ref.py b/scripts/generate_ref.py index c35e881..d2863fd 100644 --- a/scripts/generate_ref.py +++ b/scripts/generate_ref.py @@ -51,7 +51,10 @@ def get_method_sig(method): for arg in argspec.args: default_arg = _get_default_arg(argspec.args, argspec.defaults, arg_index) if default_arg.has_default: - args.append("%s=%s" % (arg, default_arg.default_value)) + val = default_arg.default_value + if isinstance(val, basestring): + val = '"' + val + '"' + args.append("%s=%s" % (arg, val)) else: args.append(arg) arg_index += 1 diff --git a/src/infi/clickhouse_orm/query.py b/src/infi/clickhouse_orm/query.py index c1c1dd0..0bf764a 100644 --- a/src/infi/clickhouse_orm/query.py +++ b/src/infi/clickhouse_orm/query.py @@ -187,6 +187,7 @@ class QuerySet(object): self._q = [] self._fields = [] self._limits = None + self._distinct = False def __iter__(self): """ @@ -228,14 +229,15 @@ class QuerySet(object): """ Returns the whole query as a SQL string. """ + distinct = 'DISTINCT ' if self._distinct else '' fields = '*' if self._fields: fields = comma_join('`%s`' % field for field in self._fields) ordering = '\nORDER BY ' + self.order_by_as_sql() if self._order_by else '' limit = '\nLIMIT %d, %d' % self._limits if self._limits else '' - params = (fields, self._model_cls.table_name(), + params = (distinct, fields, self._model_cls.table_name(), self.conditions_as_sql(), ordering, limit) - return u'SELECT %s\nFROM `%s`\nWHERE %s%s%s' % params + return u'SELECT %s%s\nFROM `%s`\nWHERE %s%s%s' % params def order_by_as_sql(self): """ @@ -259,6 +261,11 @@ class QuerySet(object): """ Returns the number of matching model instances. """ + if self._distinct: + # Use a subquery, since a simple count won't be accurate + sql = u'SELECT count() FROM (%s)' % self.as_sql() + raw = self._database.raw(sql) + return int(raw) if raw else 0 return self._database.count(self._model_cls, self.conditions_as_sql()) def order_by(self, *field_names): @@ -296,7 +303,7 @@ class QuerySet(object): return qs def paginate(self, page_num=1, page_size=100): - ''' + """ Returns a single page of model instances that match the queryset. Note that `order_by` should be used first, to ensure a correct partitioning of records into pages. @@ -306,7 +313,7 @@ class QuerySet(object): The result is a namedtuple containing `objects` (list), `number_of_objects`, `pages_total`, `number` (of the current page), and `page_size`. - ''' + """ from .database import Page count = self.count() pages_total = int(ceil(count / float(page_size))) @@ -323,8 +330,17 @@ class QuerySet(object): page_size=page_size ) + def distinct(self): + """ + Adds a DISTINCT clause to the query, meaning that any duplicate rows + in the results will be omitted. + """ + qs = copy(self) + qs._distinct = True + return qs + def aggregate(self, *args, **kwargs): - ''' + """ Returns an `AggregateQuerySet` over this query, with `args` serving as grouping fields and `kwargs` serving as calculated fields. At least one calculated field is required. For example: @@ -337,7 +353,7 @@ class QuerySet(object): WHERE data > '2017-08-01' GROUP BY event_type ``` - ''' + """ return AggregateQuerySet(self, args, kwargs) @@ -368,6 +384,7 @@ class AggregateQuerySet(QuerySet): self._order_by = list(base_qs._order_by) self._q = list(base_qs._q) self._limits = base_qs._limits + self._distinct = base_qs._distinct def group_by(self, *args): """ @@ -398,15 +415,17 @@ class AggregateQuerySet(QuerySet): """ Returns the whole query as a SQL string. """ + distinct = 'DISTINCT ' if self._distinct else '' grouping = comma_join('`%s`' % field for field in self._grouping_fields) fields = comma_join(list(self._fields) + ['%s AS %s' % (v, k) for k, v in self._calculated_fields.items()]) params = dict( + distinct=distinct, grouping=grouping or "''", fields=fields, table=self._model_cls.table_name(), conds=self.conditions_as_sql() ) - sql = u'SELECT %(fields)s\nFROM `%(table)s`\nWHERE %(conds)s\nGROUP BY %(grouping)s' % params + sql = u'SELECT %(distinct)s%(fields)s\nFROM `%(table)s`\nWHERE %(conds)s\nGROUP BY %(grouping)s' % params if self._order_by: sql += '\nORDER BY ' + self.order_by_as_sql() if self._limits: diff --git a/tests/test_querysets.py b/tests/test_querysets.py index ad834bb..cbbc65d 100644 --- a/tests/test_querysets.py +++ b/tests/test_querysets.py @@ -21,8 +21,11 @@ class QuerySetTestCase(TestCaseWithData): def _test_qs(self, qs, expected_count): logging.info(qs.as_sql()) + count = 0 for instance in qs: - logging.info('\t%s' % instance.to_dict()) + count += 1 + logging.info('\t[%d]\t%s' % (count, instance.to_dict())) + self.assertEquals(count, expected_count) self.assertEquals(qs.count(), expected_count) def test_no_filtering(self): @@ -202,6 +205,11 @@ class QuerySetTestCase(TestCaseWithData): page = qs.paginate(1, 100) self.assertEquals(page.number_of_objects, 10) + def test_distinct(self): + qs = Person.objects_in(self.database).distinct() + self._test_qs(qs, 100) + self._test_qs(qs.only('first_name'), 94) + class AggregateTestCase(TestCaseWithData): @@ -310,6 +318,12 @@ class AggregateTestCase(TestCaseWithData): qs = qs.filter(weekday=1) self.assertEquals(qs.count(), 1) + def test_aggregate_with_distinct(self): + # In this case distinct has no effect + qs = Person.objects_in(self.database).aggregate(average_height='avg(height)').distinct() + print(qs.as_sql()) + self.assertEquals(qs.count(), 1) + Color = Enum('Color', u'red blue green yellow brown white black') From 7bbcae574a12c6e316a904ff44f3b4a73b3c8c34 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sun, 10 Sep 2017 15:46:55 +0300 Subject: [PATCH 3/6] Add `AlterTableWithBuffer` migration operation --- CHANGELOG.md | 4 ++ docs/schema_migrations.md | 13 ++++++- src/infi/clickhouse_orm/migrations.py | 21 +++++++++- tests/sample_migrations/0010.py | 6 +++ tests/sample_migrations/0011.py | 6 +++ tests/test_migrations.py | 56 ++++++++++++++++++++++++++- 6 files changed, 102 insertions(+), 4 deletions(-) create mode 100644 tests/sample_migrations/0010.py create mode 100644 tests/sample_migrations/0011.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d302dc4..1bfa0fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,10 @@ Change Log ========== +Unreleased +---------- +- Add `AlterTableWithBuffer` migration operation + v0.9.6 ------ - Fix python3 compatibility (TvoroG) diff --git a/docs/schema_migrations.md b/docs/schema_migrations.md index e4647bf..ce56b04 100644 --- a/docs/schema_migrations.md +++ b/docs/schema_migrations.md @@ -34,11 +34,13 @@ The following operations are supported: **CreateTable** -A migration operation that creates a table for a given model class. +A migration operation that creates a table for a given model class. If the table already exists, the operation does nothing. + +In case the model class is a `BufferModel`, the operation first creates the underlying on-disk table, and then creates the buffer table. **DropTable** -A migration operation that drops the table of a given model class. +A migration operation that drops the table of a given model class. If the table does not exist, the operation does nothing. **AlterTable** @@ -50,6 +52,13 @@ A migration operation that compares the table of a given model class to the mode Default values are not altered by this operation. +**AlterTableWithBuffer** + +A compound migration operation for altering a buffer table and its underlying on-disk table. The buffer table is dropped, the on-disk table is altered, and then the buffer table is re-created. This is the procedure recommended in the ClickHouse documentation for handling scenarios in which the underlying table needs to be modified. + +Applying this migration operation to a regular table has the same effect as an `AlterTable` operation. + + Running Migrations ------------------ diff --git a/src/infi/clickhouse_orm/migrations.py b/src/infi/clickhouse_orm/migrations.py index 1a82a5b..a7843a7 100644 --- a/src/infi/clickhouse_orm/migrations.py +++ b/src/infi/clickhouse_orm/migrations.py @@ -82,6 +82,25 @@ class AlterTable(Operation): self._alter_table(database, 'MODIFY COLUMN %s %s' % model_field) +class AlterTableWithBuffer(Operation): + ''' + A migration operation for altering a buffer table and its underlying on-disk table. + The buffer table is dropped, the on-disk table is altered, and then the buffer table + is re-created. + ''' + + def __init__(self, model_class): + self.model_class = model_class + + def apply(self, database): + if issubclass(self.model_class, BufferModel): + DropTable(self.model_class).apply(database) + AlterTable(self.model_class.engine.main_model).apply(database) + CreateTable(self.model_class).apply(database) + else: + AlterTable(self.model_class).apply(database) + + class DropTable(Operation): ''' A migration operation that drops the table of a given model class. @@ -91,7 +110,7 @@ class DropTable(Operation): self.model_class = model_class def apply(self, database): - logger.info(' Drop table %s', self.model_class.__name__) + logger.info(' Drop table %s', self.model_class.table_name()) database.drop_table(self.model_class) diff --git a/tests/sample_migrations/0010.py b/tests/sample_migrations/0010.py new file mode 100644 index 0000000..3892583 --- /dev/null +++ b/tests/sample_migrations/0010.py @@ -0,0 +1,6 @@ +from infi.clickhouse_orm import migrations +from ..test_migrations import * + +operations = [ + migrations.CreateTable(Model4Buffer) +] diff --git a/tests/sample_migrations/0011.py b/tests/sample_migrations/0011.py new file mode 100644 index 0000000..dd9d09e --- /dev/null +++ b/tests/sample_migrations/0011.py @@ -0,0 +1,6 @@ +from infi.clickhouse_orm import migrations +from ..test_migrations import * + +operations = [ + migrations.AlterTableWithBuffer(Model4Buffer_changed) +] diff --git a/tests/test_migrations.py b/tests/test_migrations.py index 3478f9f..7e31c84 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import unittest from infi.clickhouse_orm.database import Database -from infi.clickhouse_orm.models import Model +from infi.clickhouse_orm.models import Model, BufferModel from infi.clickhouse_orm.fields import * from infi.clickhouse_orm.engines import * from infi.clickhouse_orm.migrations import MigrationHistory @@ -61,6 +61,7 @@ class MigrationsTestCase(unittest.TestCase): self.assertTrue(self.tableExists(EnumModel1)) self.assertEquals(self.getTableFields(EnumModel2), [('date', 'Date'), ('f1', "Enum16('dog' = 1, 'cat' = 2, 'horse' = 3, 'pig' = 4)")]) + # Materialized fields and alias fields self.database.migrate('tests.sample_migrations', 8) self.assertTrue(self.tableExists(MaterializedModel)) self.assertEquals(self.getTableFields(MaterializedModel), @@ -69,6 +70,15 @@ class MigrationsTestCase(unittest.TestCase): self.assertTrue(self.tableExists(AliasModel)) self.assertEquals(self.getTableFields(AliasModel), [('date', 'Date'), ('date_alias', "Date")]) + # Buffer models creation and alteration + self.database.migrate('tests.sample_migrations', 10) + self.assertTrue(self.tableExists(Model4)) + self.assertTrue(self.tableExists(Model4Buffer)) + self.assertEquals(self.getTableFields(Model4), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')]) + self.assertEquals(self.getTableFields(Model4Buffer), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')]) + self.database.migrate('tests.sample_migrations', 11) + self.assertEquals(self.getTableFields(Model4), [('date', 'Date'), ('f3', 'DateTime'), ('f2', 'String')]) + self.assertEquals(self.getTableFields(Model4Buffer), [('date', 'Date'), ('f3', 'DateTime'), ('f2', 'String')]) # Several different models with the same table name, to simulate a table that changes over time @@ -159,3 +169,47 @@ class AliasModel(Model): @classmethod def table_name(cls): return 'alias_date' + + +class Model4(Model): + + date = DateField() + f1 = Int32Field() + f2 = StringField() + + engine = MergeTree('date', ('date',)) + + @classmethod + def table_name(cls): + return 'model4' + + +class Model4Buffer(BufferModel, Model4): + + engine = Buffer(Model4) + + @classmethod + def table_name(cls): + return 'model4buffer' + + +class Model4_changed(Model): + + date = DateField() + f3 = DateTimeField() + f2 = StringField() + + engine = MergeTree('date', ('date',)) + + @classmethod + def table_name(cls): + return 'model4' + + +class Model4Buffer_changed(BufferModel, Model4_changed): + + engine = Buffer(Model4_changed) + + @classmethod + def table_name(cls): + return 'model4buffer' From 59564f8c70f614fba2ba900eb0ada8a784db8248 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sun, 10 Sep 2017 17:17:04 +0300 Subject: [PATCH 4/6] Add `distinct` method to querysets --- CHANGELOG.md | 1 + docs/class_reference.md | 30 +++++++++++++++++++++-------- docs/querysets.md | 10 ++++++++++ docs/toc.md | 1 + scripts/generate_ref.py | 5 ++++- src/infi/clickhouse_orm/query.py | 33 +++++++++++++++++++++++++------- tests/test_querysets.py | 16 +++++++++++++++- 7 files changed, 79 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bfa0fc..8143834 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ Change Log Unreleased ---------- +- Add `distinct` method to querysets - Add `AlterTableWithBuffer` migration operation v0.9.6 diff --git a/docs/class_reference.md b/docs/class_reference.md index 7e4bc74..f5ef191 100644 --- a/docs/class_reference.md +++ b/docs/class_reference.md @@ -7,7 +7,7 @@ infi.clickhouse_orm.database ### Database -Database instances connect to a specific ClickHouse database for running queries, +Database instances connect to a specific ClickHouse database for running queries, inserting data and other operations. #### Database(db_name, db_url="http://localhost:8123/", username=None, password=None, readonly=False, autocreate=True) @@ -71,7 +71,7 @@ Insert records into the database. Executes schema migrations. -- `migrations_package_name` - fully qualified name of the Python package +- `migrations_package_name` - fully qualified name of the Python package containing the migrations. - `up_to` - number of the last migration to apply. @@ -89,7 +89,7 @@ Selects records and returns a single page of model instances. - `conditions`: optional SQL conditions (contents of the WHERE clause). - `settings`: query settings to send as HTTP GET parameters -The result is a namedtuple containing `objects` (list), `number_of_objects`, +The result is a namedtuple containing `objects` (list), `number_of_objects`, `pages_total`, `number` (of the current page), and `page_size`. @@ -128,7 +128,7 @@ infi.clickhouse_orm.models A base class for ORM models. Each model class represent a ClickHouse table. For example: - + class CPUStats(Model): timestamp = DateTimeField() cpu_id = UInt16Field() @@ -172,7 +172,7 @@ If omitted, it is assumed to be the names of all fields in the model, in order o #### get_database() -Gets the `Database` that this model instance belongs to. +Gets the `Database` that this model instance belongs to. Returns `None` unless the instance was read from the database or written to it. @@ -191,7 +191,7 @@ Returns a `QuerySet` for selecting instances of this model class. #### set_database(db) -Sets the `Database` that this model instance belongs to. +Sets the `Database` that this model instance belongs to. This is done automatically when the instance is read from the database or written to it. @@ -261,7 +261,7 @@ If omitted, it is assumed to be the names of all fields in the model, in order o #### get_database() -Gets the `Database` that this model instance belongs to. +Gets the `Database` that this model instance belongs to. Returns `None` unless the instance was read from the database or written to it. @@ -280,7 +280,7 @@ Returns a `QuerySet` for selecting instances of this model class. #### set_database(db) -Sets the `Database` that this model instance belongs to. +Sets the `Database` that this model instance belongs to. This is done automatically when the instance is read from the database or written to it. @@ -585,6 +585,13 @@ Returns the contents of the query's `WHERE` clause as a string. Returns the number of matching model instances. +#### distinct() + + +Adds a DISTINCT clause to the query, meaning that any duplicate rows +in the results will be omitted. + + #### exclude(**kwargs) @@ -678,6 +685,13 @@ Returns the contents of the query's `WHERE` clause as a string. Returns the number of rows after aggregation. +#### distinct() + + +Adds a DISTINCT clause to the query, meaning that any duplicate rows +in the results will be omitted. + + #### exclude(**kwargs) diff --git a/docs/querysets.md b/docs/querysets.md index 2bbefd9..d27c836 100644 --- a/docs/querysets.md +++ b/docs/querysets.md @@ -99,6 +99,16 @@ When some of the model fields aren't needed, it is more efficient to omit them f qs = Person.objects_in(database).only('first_name', 'birthday') +Distinct +-------- + +Adds a DISTINCT clause to the query, meaning that any duplicate rows in the results will be omitted. + + >>> Person.objects_in(database).only('first_name').count() + 100 + >>> Person.objects_in(database).only('first_name').distinct().count() + 94 + Slicing ------- diff --git a/docs/toc.md b/docs/toc.md index 0f83389..f994141 100644 --- a/docs/toc.md +++ b/docs/toc.md @@ -20,6 +20,7 @@ * [Counting and Checking Existence](querysets.md#counting-and-checking-existence) * [Ordering](querysets.md#ordering) * [Omitting Fields](querysets.md#omitting-fields) + * [Distinct](querysets.md#distinct) * [Slicing](querysets.md#slicing) * [Pagination](querysets.md#pagination) * [Aggregation](querysets.md#aggregation) diff --git a/scripts/generate_ref.py b/scripts/generate_ref.py index c35e881..d2863fd 100644 --- a/scripts/generate_ref.py +++ b/scripts/generate_ref.py @@ -51,7 +51,10 @@ def get_method_sig(method): for arg in argspec.args: default_arg = _get_default_arg(argspec.args, argspec.defaults, arg_index) if default_arg.has_default: - args.append("%s=%s" % (arg, default_arg.default_value)) + val = default_arg.default_value + if isinstance(val, basestring): + val = '"' + val + '"' + args.append("%s=%s" % (arg, val)) else: args.append(arg) arg_index += 1 diff --git a/src/infi/clickhouse_orm/query.py b/src/infi/clickhouse_orm/query.py index c1c1dd0..0bf764a 100644 --- a/src/infi/clickhouse_orm/query.py +++ b/src/infi/clickhouse_orm/query.py @@ -187,6 +187,7 @@ class QuerySet(object): self._q = [] self._fields = [] self._limits = None + self._distinct = False def __iter__(self): """ @@ -228,14 +229,15 @@ class QuerySet(object): """ Returns the whole query as a SQL string. """ + distinct = 'DISTINCT ' if self._distinct else '' fields = '*' if self._fields: fields = comma_join('`%s`' % field for field in self._fields) ordering = '\nORDER BY ' + self.order_by_as_sql() if self._order_by else '' limit = '\nLIMIT %d, %d' % self._limits if self._limits else '' - params = (fields, self._model_cls.table_name(), + params = (distinct, fields, self._model_cls.table_name(), self.conditions_as_sql(), ordering, limit) - return u'SELECT %s\nFROM `%s`\nWHERE %s%s%s' % params + return u'SELECT %s%s\nFROM `%s`\nWHERE %s%s%s' % params def order_by_as_sql(self): """ @@ -259,6 +261,11 @@ class QuerySet(object): """ Returns the number of matching model instances. """ + if self._distinct: + # Use a subquery, since a simple count won't be accurate + sql = u'SELECT count() FROM (%s)' % self.as_sql() + raw = self._database.raw(sql) + return int(raw) if raw else 0 return self._database.count(self._model_cls, self.conditions_as_sql()) def order_by(self, *field_names): @@ -296,7 +303,7 @@ class QuerySet(object): return qs def paginate(self, page_num=1, page_size=100): - ''' + """ Returns a single page of model instances that match the queryset. Note that `order_by` should be used first, to ensure a correct partitioning of records into pages. @@ -306,7 +313,7 @@ class QuerySet(object): The result is a namedtuple containing `objects` (list), `number_of_objects`, `pages_total`, `number` (of the current page), and `page_size`. - ''' + """ from .database import Page count = self.count() pages_total = int(ceil(count / float(page_size))) @@ -323,8 +330,17 @@ class QuerySet(object): page_size=page_size ) + def distinct(self): + """ + Adds a DISTINCT clause to the query, meaning that any duplicate rows + in the results will be omitted. + """ + qs = copy(self) + qs._distinct = True + return qs + def aggregate(self, *args, **kwargs): - ''' + """ Returns an `AggregateQuerySet` over this query, with `args` serving as grouping fields and `kwargs` serving as calculated fields. At least one calculated field is required. For example: @@ -337,7 +353,7 @@ class QuerySet(object): WHERE data > '2017-08-01' GROUP BY event_type ``` - ''' + """ return AggregateQuerySet(self, args, kwargs) @@ -368,6 +384,7 @@ class AggregateQuerySet(QuerySet): self._order_by = list(base_qs._order_by) self._q = list(base_qs._q) self._limits = base_qs._limits + self._distinct = base_qs._distinct def group_by(self, *args): """ @@ -398,15 +415,17 @@ class AggregateQuerySet(QuerySet): """ Returns the whole query as a SQL string. """ + distinct = 'DISTINCT ' if self._distinct else '' grouping = comma_join('`%s`' % field for field in self._grouping_fields) fields = comma_join(list(self._fields) + ['%s AS %s' % (v, k) for k, v in self._calculated_fields.items()]) params = dict( + distinct=distinct, grouping=grouping or "''", fields=fields, table=self._model_cls.table_name(), conds=self.conditions_as_sql() ) - sql = u'SELECT %(fields)s\nFROM `%(table)s`\nWHERE %(conds)s\nGROUP BY %(grouping)s' % params + sql = u'SELECT %(distinct)s%(fields)s\nFROM `%(table)s`\nWHERE %(conds)s\nGROUP BY %(grouping)s' % params if self._order_by: sql += '\nORDER BY ' + self.order_by_as_sql() if self._limits: diff --git a/tests/test_querysets.py b/tests/test_querysets.py index ad834bb..cbbc65d 100644 --- a/tests/test_querysets.py +++ b/tests/test_querysets.py @@ -21,8 +21,11 @@ class QuerySetTestCase(TestCaseWithData): def _test_qs(self, qs, expected_count): logging.info(qs.as_sql()) + count = 0 for instance in qs: - logging.info('\t%s' % instance.to_dict()) + count += 1 + logging.info('\t[%d]\t%s' % (count, instance.to_dict())) + self.assertEquals(count, expected_count) self.assertEquals(qs.count(), expected_count) def test_no_filtering(self): @@ -202,6 +205,11 @@ class QuerySetTestCase(TestCaseWithData): page = qs.paginate(1, 100) self.assertEquals(page.number_of_objects, 10) + def test_distinct(self): + qs = Person.objects_in(self.database).distinct() + self._test_qs(qs, 100) + self._test_qs(qs.only('first_name'), 94) + class AggregateTestCase(TestCaseWithData): @@ -310,6 +318,12 @@ class AggregateTestCase(TestCaseWithData): qs = qs.filter(weekday=1) self.assertEquals(qs.count(), 1) + def test_aggregate_with_distinct(self): + # In this case distinct has no effect + qs = Person.objects_in(self.database).aggregate(average_height='avg(height)').distinct() + print(qs.as_sql()) + self.assertEquals(qs.count(), 1) + Color = Enum('Color', u'red blue green yellow brown white black') From 8304ddca5ca3e44f9243bf9d332e12d195b183b1 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Wed, 13 Sep 2017 12:15:44 +0300 Subject: [PATCH 5/6] Update docs --- docs/class_reference.md | 55 +++++++++++++++++++------------ docs/toc.md | 1 + src/infi/clickhouse_orm/fields.py | 2 +- 3 files changed, 36 insertions(+), 22 deletions(-) diff --git a/docs/class_reference.md b/docs/class_reference.md index f5ef191..e56fc7a 100644 --- a/docs/class_reference.md +++ b/docs/class_reference.md @@ -317,28 +317,28 @@ infi.clickhouse_orm.fields Abstract base class for all field types. -#### Field(default=None, alias=None, materialized=None) +#### Field(default=None, alias=None, materialized=None, readonly=None) ### StringField Extends Field -#### StringField(default=None, alias=None, materialized=None) +#### StringField(default=None, alias=None, materialized=None, readonly=None) ### DateField Extends Field -#### DateField(default=None, alias=None, materialized=None) +#### DateField(default=None, alias=None, materialized=None, readonly=None) ### DateTimeField Extends Field -#### DateTimeField(default=None, alias=None, materialized=None) +#### DateTimeField(default=None, alias=None, materialized=None, readonly=None) ### BaseIntField @@ -348,7 +348,7 @@ Extends Field Abstract base class for all integer-type fields. -#### BaseIntField(default=None, alias=None, materialized=None) +#### BaseIntField(default=None, alias=None, materialized=None, readonly=None) ### BaseFloatField @@ -358,7 +358,7 @@ Extends Field Abstract base class for all float-type fields. -#### BaseFloatField(default=None, alias=None, materialized=None) +#### BaseFloatField(default=None, alias=None, materialized=None, readonly=None) ### BaseEnumField @@ -368,14 +368,14 @@ Extends Field Abstract base class for all enum-type fields. -#### BaseEnumField(enum_cls, default=None, alias=None, materialized=None) +#### BaseEnumField(enum_cls, default=None, alias=None, materialized=None, readonly=None) ### ArrayField Extends Field -#### ArrayField(inner_field, default=None, alias=None, materialized=None) +#### ArrayField(inner_field, default=None, alias=None, materialized=None, readonly=None) ### NullableField @@ -389,91 +389,91 @@ Extends Field Extends StringField -#### FixedStringField(length, default=None, alias=None, materialized=None) +#### FixedStringField(length, default=None, alias=None, materialized=None, readonly=None) ### UInt8Field Extends BaseIntField -#### UInt8Field(default=None, alias=None, materialized=None) +#### UInt8Field(default=None, alias=None, materialized=None, readonly=None) ### UInt16Field Extends BaseIntField -#### UInt16Field(default=None, alias=None, materialized=None) +#### UInt16Field(default=None, alias=None, materialized=None, readonly=None) ### UInt32Field Extends BaseIntField -#### UInt32Field(default=None, alias=None, materialized=None) +#### UInt32Field(default=None, alias=None, materialized=None, readonly=None) ### UInt64Field Extends BaseIntField -#### UInt64Field(default=None, alias=None, materialized=None) +#### UInt64Field(default=None, alias=None, materialized=None, readonly=None) ### Int8Field Extends BaseIntField -#### Int8Field(default=None, alias=None, materialized=None) +#### Int8Field(default=None, alias=None, materialized=None, readonly=None) ### Int16Field Extends BaseIntField -#### Int16Field(default=None, alias=None, materialized=None) +#### Int16Field(default=None, alias=None, materialized=None, readonly=None) ### Int32Field Extends BaseIntField -#### Int32Field(default=None, alias=None, materialized=None) +#### Int32Field(default=None, alias=None, materialized=None, readonly=None) ### Int64Field Extends BaseIntField -#### Int64Field(default=None, alias=None, materialized=None) +#### Int64Field(default=None, alias=None, materialized=None, readonly=None) ### Float32Field Extends BaseFloatField -#### Float32Field(default=None, alias=None, materialized=None) +#### Float32Field(default=None, alias=None, materialized=None, readonly=None) ### Float64Field Extends BaseFloatField -#### Float64Field(default=None, alias=None, materialized=None) +#### Float64Field(default=None, alias=None, materialized=None, readonly=None) ### Enum8Field Extends BaseEnumField -#### Enum8Field(enum_cls, default=None, alias=None, materialized=None) +#### Enum8Field(enum_cls, default=None, alias=None, materialized=None, readonly=None) ### Enum16Field Extends BaseEnumField -#### Enum16Field(enum_cls, default=None, alias=None, materialized=None) +#### Enum16Field(enum_cls, default=None, alias=None, materialized=None, readonly=None) infi.clickhouse_orm.engines @@ -512,6 +512,19 @@ Read more [here](https://clickhouse.yandex/reference_en.html#Buffer). #### Buffer(main_model, num_layers=16, min_time=10, max_time=100, min_rows=10000, max_rows=1000000, min_bytes=10000000, max_bytes=100000000) +### Merge + +Extends Engine + + +The Merge engine (not to be confused with MergeTree) does not store data itself, +but allows reading from any number of other tables simultaneously. +Writing to a table is not supported +https://clickhouse.yandex/docs/en/single/index.html#document-table_engines/merge + +#### Merge(table_regex) + + ### CollapsingMergeTree Extends MergeTree diff --git a/docs/toc.md b/docs/toc.md index f994141..cc1c8a6 100644 --- a/docs/toc.md +++ b/docs/toc.md @@ -87,6 +87,7 @@ * [Memory](class_reference.md#memory) * [MergeTree](class_reference.md#mergetree) * [Buffer](class_reference.md#buffer) + * [Merge](class_reference.md#merge) * [CollapsingMergeTree](class_reference.md#collapsingmergetree) * [SummingMergeTree](class_reference.md#summingmergetree) * [ReplacingMergeTree](class_reference.md#replacingmergetree) diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index a114db3..3e3207a 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -96,7 +96,7 @@ class FixedStringField(StringField): def __init__(self, length, default=None, alias=None, materialized=None, readonly=None): self._length = length self.db_type = 'FixedString(%d)' % length - super(FixedStringField, self).__init__(default, alias, materialized,readonly) + super(FixedStringField, self).__init__(default, alias, materialized, readonly) def to_python(self, value, timezone_in_use): value = super(FixedStringField, self).to_python(value, timezone_in_use) From 77fe6704d8bb0dbf4a03295778131d174ceee8cf Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Wed, 13 Sep 2017 12:24:51 +0300 Subject: [PATCH 6/6] Releasing v0.9.7 --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8143834..a32d9d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,11 @@ Change Log ========== -Unreleased ----------- +v0.9.7 +------ - Add `distinct` method to querysets - Add `AlterTableWithBuffer` migration operation +- Support Merge engine (M1hacka) v0.9.6 ------