From 508c959d2c035ae804c6d1b2504cea01b96fe30f Mon Sep 17 00:00:00 2001 From: M1ha Date: Tue, 22 Aug 2017 13:25:40 +0500 Subject: [PATCH 01/10] Fixed bug with getting SystemPart info on actual ClickHouse version (1.1.54245) --- src/infi/clickhouse_orm/system_models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/infi/clickhouse_orm/system_models.py b/src/infi/clickhouse_orm/system_models.py index 8c66550..b0dce51 100644 --- a/src/infi/clickhouse_orm/system_models.py +++ b/src/infi/clickhouse_orm/system_models.py @@ -22,7 +22,10 @@ class SystemPart(Model): engine = StringField() # Name of the table engine, without parameters. partition = StringField() # Name of the partition, in the format YYYYMM. name = StringField() # Name of the part. - replicated = UInt8Field() # Whether the part belongs to replicated data. + + # This field is present in the docs (https://clickhouse.yandex/docs/en/single/index.html#system-parts), + # but is absent in ClickHouse (in version 1.1.54245) + # replicated = UInt8Field() # Whether the part belongs to replicated data. # Whether the part is used in a table, or is no longer needed and will be deleted soon. # Inactive parts remain after merging. From de9f64cd3a7aa9525d4a18c20dc1c7b74d61fe87 Mon Sep 17 00:00:00 2001 From: M1ha Date: Thu, 7 Sep 2017 17:44:27 +0500 Subject: [PATCH 02/10] Added Merge engine 1) Divided readonly and system flags of Field model. Readonly flag only restricts insert operations, while system flag restricts also create and drop table operations 2) Added Merge engine and tests for it 3) Added docs for Merge engine 4) Added opportunity to make Field readonly. This is useful for "virtual" columns (https://clickhouse.yandex/docs/en/single/index.html#virtual-columns) --- docs/table_engines.md | 13 ++++++ docs/toc.md | 1 + src/infi/clickhouse_orm/database.py | 12 +++--- src/infi/clickhouse_orm/engines.py | 28 ++++++++++++- src/infi/clickhouse_orm/fields.py | 8 ++-- src/infi/clickhouse_orm/models.py | 30 +++++++++++++- src/infi/clickhouse_orm/system_models.py | 1 + tests/test_engines.py | 52 +++++++++++++++++++++++- tests/test_readonly.py | 15 +++---- tests/test_system_models.py | 28 ++++++++++++- 10 files changed, 162 insertions(+), 26 deletions(-) diff --git a/docs/table_engines.md b/docs/table_engines.md index 2f92183..30aa07b 100644 --- a/docs/table_engines.md +++ b/docs/table_engines.md @@ -15,6 +15,7 @@ The following engines are supported by the ORM: - SummingMergeTree / ReplicatedSummingMergeTree - ReplacingMergeTree / ReplicatedReplacingMergeTree - Buffer +- Merge Simple Engines @@ -85,6 +86,18 @@ Then you can insert objects into Buffer model and they will be handled by ClickH suzy = PersonBuffer(first_name='Suzy', last_name='Jones') dan = PersonBuffer(first_name='Dan', last_name='Schwartz') db.insert([dan, suzy]) + + +Merge Engine +------------- + +[ClickHouse docs](https://clickhouse.yandex/docs/en/single/index.html#merge) +A `Merge` engine is only used in conjunction with a `MergeModel`. +This table does not store data itself, but allows reading from any number of other tables simultaneously. So you can't insert in it. +Engine parameter specifies re2 (similar to PCRE) regular expression, from which data is selected. + + class MergeTable(models.MergeModel): + engine = engines.Merge('^table_prefix') --- diff --git a/docs/toc.md b/docs/toc.md index aa5bb3b..0f83389 100644 --- a/docs/toc.md +++ b/docs/toc.md @@ -36,6 +36,7 @@ * [Engines in the MergeTree Family](table_engines.md#engines-in-the-mergetree-family) * [Data Replication](table_engines.md#data-replication) * [Buffer Engine](table_engines.md#buffer-engine) + * [Merge Engine](table_engines.md#merge-engine) * [Schema Migrations](schema_migrations.md#schema-migrations) * [Writing Migrations](schema_migrations.md#writing-migrations) diff --git a/src/infi/clickhouse_orm/database.py b/src/infi/clickhouse_orm/database.py index 5c816d6..4f94b6b 100644 --- a/src/infi/clickhouse_orm/database.py +++ b/src/infi/clickhouse_orm/database.py @@ -75,16 +75,16 @@ class Database(object): Creates a table for the given model class, if it does not exist already. ''' # TODO check that model has an engine - if model_class.readonly: - raise DatabaseException("You can't create read only table") + if model_class.system: + raise DatabaseException("You can't create system table") self._send(model_class.create_table_sql(self.db_name)) def drop_table(self, model_class): ''' Drops the database table of the given model class, if it exists. ''' - if model_class.readonly: - raise DatabaseException("You can't drop read only table") + if model_class.system: + raise DatabaseException("You can't drop system table") self._send(model_class.drop_table_sql(self.db_name)) def insert(self, model_instances, batch_size=1000): @@ -103,8 +103,8 @@ class Database(object): return # model_instances is empty model_class = first_instance.__class__ - if first_instance.readonly: - raise DatabaseException("You can't insert into read only table") + if first_instance.readonly or first_instance.system: + raise DatabaseException("You can't insert into read only and system tables") def gen(): buf = BytesIO() diff --git a/src/infi/clickhouse_orm/engines.py b/src/infi/clickhouse_orm/engines.py index caa05c7..a91fa6c 100644 --- a/src/infi/clickhouse_orm/engines.py +++ b/src/infi/clickhouse_orm/engines.py @@ -1,4 +1,7 @@ from __future__ import unicode_literals + +import six + from .utils import comma_join @@ -118,7 +121,6 @@ class Buffer(Engine): self.min_bytes = min_bytes self.max_bytes = max_bytes - def create_table_sql(self, db_name): # Overriden create_table_sql example: #sql = 'ENGINE = Buffer(merge, hits, 16, 10, 100, 10000, 1000000, 10000000, 100000000)' @@ -128,3 +130,27 @@ class Buffer(Engine): self.max_rows, self.min_bytes, self.max_bytes ) return sql + + +class Merge(Engine): + """ + The Merge engine (not to be confused with MergeTree) does not store data itself, + but allows reading from any number of other tables simultaneously. + Writing to a table is not supported + https://clickhouse.yandex/docs/en/single/index.html#document-table_engines/merge + """ + + def __init__(self, table_regex): + assert isinstance(table_regex, six.string_types), "'db_name' parameter must be string" + + self.table_regex = table_regex + + # Use current database as default + self.db_name = 'currentDatabase()' + + def create_table_sql(self): + return "Merge(%s, '%s')" % (self.db_name, self.table_regex) + + def set_db_name(self, db_name): + assert isinstance(db_name, six.string_types), "'db_name' parameter must be string" + self.db_name = db_name diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index a57314e..0cf2543 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -16,19 +16,21 @@ class Field(object): class_default = 0 db_type = None - def __init__(self, default=None, alias=None, materialized=None): + def __init__(self, default=None, alias=None, materialized=None, readonly=None): assert (None, None) in {(default, alias), (alias, materialized), (default, materialized)}, \ "Only one of default, alias and materialized parameters can be given" assert alias is None or isinstance(alias, string_types) and alias != "",\ "Alias field must be string field name, if given" assert materialized is None or isinstance(materialized, string_types) and alias != "",\ "Materialized field must be string, if given" + assert readonly is None or type(readonly) is bool, "readonly parameter must be bool if given" self.creation_counter = Field.creation_counter Field.creation_counter += 1 self.default = self.class_default if default is None else default self.alias = alias self.materialized = materialized + self.readonly = bool(self.alias or self.materialized or readonly) def to_python(self, value, timezone_in_use): ''' @@ -75,10 +77,6 @@ class Field(object): else: return self.db_type - @property - def readonly(self): - return bool(self.alias or self.materialized) - class StringField(Field): diff --git a/src/infi/clickhouse_orm/models.py b/src/infi/clickhouse_orm/models.py index 8714447..33cfde2 100644 --- a/src/infi/clickhouse_orm/models.py +++ b/src/infi/clickhouse_orm/models.py @@ -4,9 +4,10 @@ from logging import getLogger from six import with_metaclass import pytz -from .fields import Field +from .fields import Field, StringField from .utils import parse_tsv from .query import QuerySet +from .engines import Merge logger = getLogger('clickhouse_orm') @@ -86,8 +87,13 @@ class Model(with_metaclass(ModelBase)): ''' engine = None + + # Insert operations are restricted for read only models readonly = False + # Create table, drop table, insert operations are restricted for system models + system = False + def __init__(self, **kwargs): ''' Creates a model instance, using keyword arguments as field values. @@ -246,3 +252,25 @@ class BufferModel(Model): engine_str = cls.engine.create_table_sql(db_name) parts.append(engine_str) return ' '.join(parts) + + +class MergeModel(Model): + ''' + Model for Merge engine + Predefines virtual _table column an controls that rows can't be inserted to this table type + https://clickhouse.yandex/docs/en/single/index.html#document-table_engines/merge + ''' + readonly = True + + # Virtual fields can't be inserted into database + _table = StringField(readonly=True) + + def set_database(self, db): + ''' + Gets the `Database` that this model instance belongs to. + Returns `None` unless the instance was read from the database or written to it. + ''' + assert isinstance(self.engine, Merge), "engine must be engines.Merge instance" + res = super(MergeModel, self).set_database(db) + self.engine.set_db_name(db.db_name) + return res diff --git a/src/infi/clickhouse_orm/system_models.py b/src/infi/clickhouse_orm/system_models.py index 7edd902..5ca3efd 100644 --- a/src/infi/clickhouse_orm/system_models.py +++ b/src/infi/clickhouse_orm/system_models.py @@ -20,6 +20,7 @@ class SystemPart(Model): OPERATIONS = frozenset({'DETACH', 'DROP', 'ATTACH', 'FREEZE', 'FETCH'}) readonly = True + system = True database = StringField() # Name of the database where the table that this part belongs to is located. table = StringField() # Name of the table that this part belongs to. diff --git a/tests/test_engines.py b/tests/test_engines.py index ddc3a85..65497ca 100644 --- a/tests/test_engines.py +++ b/tests/test_engines.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import unittest from infi.clickhouse_orm.database import Database, DatabaseException -from infi.clickhouse_orm.models import Model +from infi.clickhouse_orm.models import Model, MergeModel from infi.clickhouse_orm.fields import * from infi.clickhouse_orm.engines import * @@ -74,6 +74,56 @@ class EnginesTestCase(unittest.TestCase): engine = Memory() self._create_and_insert(TestModel) + def test_merge(self): + class TestModel1(SampleModel): + engine = TinyLog() + + class TestModel2(SampleModel): + engine = TinyLog() + + class TestMergeModel(MergeModel, SampleModel): + engine = Merge('^testmodel') + + self.database.create_table(TestModel1) + self.database.create_table(TestModel2) + self.database.create_table(TestMergeModel) + + # Insert operations are restricted for this model type + with self.assertRaises(DatabaseException): + self.database.insert([ + TestMergeModel(date='2017-01-01', event_id=23423, event_group=13, event_count=7, event_version=1) + ]) + + # Testing select + self.database.insert([ + TestModel1(date='2017-01-01', event_id=1, event_group=1, event_count=1, event_version=1) + ]) + self.database.insert([ + TestModel2(date='2017-01-02', event_id=2, event_group=2, event_count=2, event_version=2) + ]) + # event_uversion is materialized field. So * won't select it and it will be zero + res = self.database.select('SELECT *, event_uversion FROM $table ORDER BY event_id', model_class=TestMergeModel) + res = [row for row in res] + self.assertEqual(2, len(res)) + self.assertDictEqual({ + '_table': 'testmodel1', + 'date': datetime.date(2017, 1, 1), + 'event_id': 1, + 'event_group': 1, + 'event_count': 1, + 'event_version': 1, + 'event_uversion': 1 + }, res[0].to_dict(include_readonly=True)) + self.assertDictEqual({ + '_table': 'testmodel2', + 'date': datetime.date(2017, 1, 2), + 'event_id': 2, + 'event_group': 2, + 'event_count': 2, + 'event_version': 2, + 'event_uversion': 2 + }, res[1].to_dict(include_readonly=True)) + class SampleModel(Model): diff --git a/tests/test_readonly.py b/tests/test_readonly.py index ae3d54f..facbaa0 100644 --- a/tests/test_readonly.py +++ b/tests/test_readonly.py @@ -1,12 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -import unittest -import six -from infi.clickhouse_orm.database import Database, DatabaseException -from infi.clickhouse_orm.models import Model -from infi.clickhouse_orm.fields import * -from infi.clickhouse_orm.engines import * +from infi.clickhouse_orm.database import DatabaseException from .base_test_with_data import * @@ -45,15 +40,15 @@ class ReadonlyTestCase(TestCaseWithData): self.database.insert([m]) def test_create_readonly_table(self): - with self.assertRaises(DatabaseException): - self.database.create_table(ReadOnlyModel) + self.database.create_table(ReadOnlyModel) def test_drop_readonly_table(self): - with self.assertRaises(DatabaseException): - self.database.drop_table(ReadOnlyModel) + self.database.drop_table(ReadOnlyModel) class ReadOnlyModel(Model): readonly = True name = StringField() + date = DateField() + engine = MergeTree('date', ('name',)) diff --git a/tests/test_system_models.py b/tests/test_system_models.py index 1a3b49a..54b6650 100644 --- a/tests/test_system_models.py +++ b/tests/test_system_models.py @@ -2,14 +2,34 @@ from __future__ import unicode_literals import unittest from datetime import date import os -import shutil -from infi.clickhouse_orm.database import Database +from infi.clickhouse_orm.database import Database, DatabaseException from infi.clickhouse_orm.engines import * from infi.clickhouse_orm.fields import * from infi.clickhouse_orm.models import Model from infi.clickhouse_orm.system_models import SystemPart +class SystemTest(unittest.TestCase): + def setUp(self): + self.database = Database('test-db') + + def tearDown(self): + self.database.drop_database() + + def test_insert_system(self): + m = SystemPart() + with self.assertRaises(DatabaseException): + self.database.insert([m]) + + def test_create_readonly_table(self): + with self.assertRaises(DatabaseException): + self.database.create_table(SystemTestModel) + + def test_drop_readonly_table(self): + with self.assertRaises(DatabaseException): + self.database.drop_table(SystemTestModel) + + class SystemPartTest(unittest.TestCase): BACKUP_DIRS = ['/var/lib/clickhouse/shadow', '/opt/clickhouse/shadow/'] @@ -75,3 +95,7 @@ class TestTable(Model): date_field = DateField() engine = MergeTree('date_field', ('date_field',)) + + +class SystemTestModel(Model): + system = True From b7cf611595ba6abcaf6a5e31bda907a743754f50 Mon Sep 17 00:00:00 2001 From: M1ha Date: Fri, 8 Sep 2017 09:15:30 +0500 Subject: [PATCH 03/10] Fixed error text --- src/infi/clickhouse_orm/engines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/infi/clickhouse_orm/engines.py b/src/infi/clickhouse_orm/engines.py index a91fa6c..777bd2e 100644 --- a/src/infi/clickhouse_orm/engines.py +++ b/src/infi/clickhouse_orm/engines.py @@ -141,7 +141,7 @@ class Merge(Engine): """ def __init__(self, table_regex): - assert isinstance(table_regex, six.string_types), "'db_name' parameter must be string" + assert isinstance(table_regex, six.string_types), "'table_regex' parameter must be string" self.table_regex = table_regex From 841340acd46ce50569d14634e56a9d4726a956c6 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sun, 10 Sep 2017 15:46:55 +0300 Subject: [PATCH 04/10] Add `AlterTableWithBuffer` migration operation --- CHANGELOG.md | 4 ++ docs/schema_migrations.md | 13 ++++++- src/infi/clickhouse_orm/migrations.py | 21 +++++++++- tests/sample_migrations/0010.py | 6 +++ tests/sample_migrations/0011.py | 6 +++ tests/test_migrations.py | 56 ++++++++++++++++++++++++++- 6 files changed, 102 insertions(+), 4 deletions(-) create mode 100644 tests/sample_migrations/0010.py create mode 100644 tests/sample_migrations/0011.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d302dc4..1bfa0fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,10 @@ Change Log ========== +Unreleased +---------- +- Add `AlterTableWithBuffer` migration operation + v0.9.6 ------ - Fix python3 compatibility (TvoroG) diff --git a/docs/schema_migrations.md b/docs/schema_migrations.md index e4647bf..ce56b04 100644 --- a/docs/schema_migrations.md +++ b/docs/schema_migrations.md @@ -34,11 +34,13 @@ The following operations are supported: **CreateTable** -A migration operation that creates a table for a given model class. +A migration operation that creates a table for a given model class. If the table already exists, the operation does nothing. + +In case the model class is a `BufferModel`, the operation first creates the underlying on-disk table, and then creates the buffer table. **DropTable** -A migration operation that drops the table of a given model class. +A migration operation that drops the table of a given model class. If the table does not exist, the operation does nothing. **AlterTable** @@ -50,6 +52,13 @@ A migration operation that compares the table of a given model class to the mode Default values are not altered by this operation. +**AlterTableWithBuffer** + +A compound migration operation for altering a buffer table and its underlying on-disk table. The buffer table is dropped, the on-disk table is altered, and then the buffer table is re-created. This is the procedure recommended in the ClickHouse documentation for handling scenarios in which the underlying table needs to be modified. + +Applying this migration operation to a regular table has the same effect as an `AlterTable` operation. + + Running Migrations ------------------ diff --git a/src/infi/clickhouse_orm/migrations.py b/src/infi/clickhouse_orm/migrations.py index 1a82a5b..a7843a7 100644 --- a/src/infi/clickhouse_orm/migrations.py +++ b/src/infi/clickhouse_orm/migrations.py @@ -82,6 +82,25 @@ class AlterTable(Operation): self._alter_table(database, 'MODIFY COLUMN %s %s' % model_field) +class AlterTableWithBuffer(Operation): + ''' + A migration operation for altering a buffer table and its underlying on-disk table. + The buffer table is dropped, the on-disk table is altered, and then the buffer table + is re-created. + ''' + + def __init__(self, model_class): + self.model_class = model_class + + def apply(self, database): + if issubclass(self.model_class, BufferModel): + DropTable(self.model_class).apply(database) + AlterTable(self.model_class.engine.main_model).apply(database) + CreateTable(self.model_class).apply(database) + else: + AlterTable(self.model_class).apply(database) + + class DropTable(Operation): ''' A migration operation that drops the table of a given model class. @@ -91,7 +110,7 @@ class DropTable(Operation): self.model_class = model_class def apply(self, database): - logger.info(' Drop table %s', self.model_class.__name__) + logger.info(' Drop table %s', self.model_class.table_name()) database.drop_table(self.model_class) diff --git a/tests/sample_migrations/0010.py b/tests/sample_migrations/0010.py new file mode 100644 index 0000000..3892583 --- /dev/null +++ b/tests/sample_migrations/0010.py @@ -0,0 +1,6 @@ +from infi.clickhouse_orm import migrations +from ..test_migrations import * + +operations = [ + migrations.CreateTable(Model4Buffer) +] diff --git a/tests/sample_migrations/0011.py b/tests/sample_migrations/0011.py new file mode 100644 index 0000000..dd9d09e --- /dev/null +++ b/tests/sample_migrations/0011.py @@ -0,0 +1,6 @@ +from infi.clickhouse_orm import migrations +from ..test_migrations import * + +operations = [ + migrations.AlterTableWithBuffer(Model4Buffer_changed) +] diff --git a/tests/test_migrations.py b/tests/test_migrations.py index 3478f9f..7e31c84 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import unittest from infi.clickhouse_orm.database import Database -from infi.clickhouse_orm.models import Model +from infi.clickhouse_orm.models import Model, BufferModel from infi.clickhouse_orm.fields import * from infi.clickhouse_orm.engines import * from infi.clickhouse_orm.migrations import MigrationHistory @@ -61,6 +61,7 @@ class MigrationsTestCase(unittest.TestCase): self.assertTrue(self.tableExists(EnumModel1)) self.assertEquals(self.getTableFields(EnumModel2), [('date', 'Date'), ('f1', "Enum16('dog' = 1, 'cat' = 2, 'horse' = 3, 'pig' = 4)")]) + # Materialized fields and alias fields self.database.migrate('tests.sample_migrations', 8) self.assertTrue(self.tableExists(MaterializedModel)) self.assertEquals(self.getTableFields(MaterializedModel), @@ -69,6 +70,15 @@ class MigrationsTestCase(unittest.TestCase): self.assertTrue(self.tableExists(AliasModel)) self.assertEquals(self.getTableFields(AliasModel), [('date', 'Date'), ('date_alias', "Date")]) + # Buffer models creation and alteration + self.database.migrate('tests.sample_migrations', 10) + self.assertTrue(self.tableExists(Model4)) + self.assertTrue(self.tableExists(Model4Buffer)) + self.assertEquals(self.getTableFields(Model4), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')]) + self.assertEquals(self.getTableFields(Model4Buffer), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')]) + self.database.migrate('tests.sample_migrations', 11) + self.assertEquals(self.getTableFields(Model4), [('date', 'Date'), ('f3', 'DateTime'), ('f2', 'String')]) + self.assertEquals(self.getTableFields(Model4Buffer), [('date', 'Date'), ('f3', 'DateTime'), ('f2', 'String')]) # Several different models with the same table name, to simulate a table that changes over time @@ -159,3 +169,47 @@ class AliasModel(Model): @classmethod def table_name(cls): return 'alias_date' + + +class Model4(Model): + + date = DateField() + f1 = Int32Field() + f2 = StringField() + + engine = MergeTree('date', ('date',)) + + @classmethod + def table_name(cls): + return 'model4' + + +class Model4Buffer(BufferModel, Model4): + + engine = Buffer(Model4) + + @classmethod + def table_name(cls): + return 'model4buffer' + + +class Model4_changed(Model): + + date = DateField() + f3 = DateTimeField() + f2 = StringField() + + engine = MergeTree('date', ('date',)) + + @classmethod + def table_name(cls): + return 'model4' + + +class Model4Buffer_changed(BufferModel, Model4_changed): + + engine = Buffer(Model4_changed) + + @classmethod + def table_name(cls): + return 'model4buffer' From 01f91de54b0cc2f69e8688590dcce31054414e56 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sun, 10 Sep 2017 17:17:04 +0300 Subject: [PATCH 05/10] Add `distinct` method to querysets --- CHANGELOG.md | 1 + docs/class_reference.md | 30 +++++++++++++++++++++-------- docs/querysets.md | 10 ++++++++++ docs/toc.md | 1 + scripts/generate_ref.py | 5 ++++- src/infi/clickhouse_orm/query.py | 33 +++++++++++++++++++++++++------- tests/test_querysets.py | 16 +++++++++++++++- 7 files changed, 79 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bfa0fc..8143834 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ Change Log Unreleased ---------- +- Add `distinct` method to querysets - Add `AlterTableWithBuffer` migration operation v0.9.6 diff --git a/docs/class_reference.md b/docs/class_reference.md index 7e4bc74..f5ef191 100644 --- a/docs/class_reference.md +++ b/docs/class_reference.md @@ -7,7 +7,7 @@ infi.clickhouse_orm.database ### Database -Database instances connect to a specific ClickHouse database for running queries, +Database instances connect to a specific ClickHouse database for running queries, inserting data and other operations. #### Database(db_name, db_url="http://localhost:8123/", username=None, password=None, readonly=False, autocreate=True) @@ -71,7 +71,7 @@ Insert records into the database. Executes schema migrations. -- `migrations_package_name` - fully qualified name of the Python package +- `migrations_package_name` - fully qualified name of the Python package containing the migrations. - `up_to` - number of the last migration to apply. @@ -89,7 +89,7 @@ Selects records and returns a single page of model instances. - `conditions`: optional SQL conditions (contents of the WHERE clause). - `settings`: query settings to send as HTTP GET parameters -The result is a namedtuple containing `objects` (list), `number_of_objects`, +The result is a namedtuple containing `objects` (list), `number_of_objects`, `pages_total`, `number` (of the current page), and `page_size`. @@ -128,7 +128,7 @@ infi.clickhouse_orm.models A base class for ORM models. Each model class represent a ClickHouse table. For example: - + class CPUStats(Model): timestamp = DateTimeField() cpu_id = UInt16Field() @@ -172,7 +172,7 @@ If omitted, it is assumed to be the names of all fields in the model, in order o #### get_database() -Gets the `Database` that this model instance belongs to. +Gets the `Database` that this model instance belongs to. Returns `None` unless the instance was read from the database or written to it. @@ -191,7 +191,7 @@ Returns a `QuerySet` for selecting instances of this model class. #### set_database(db) -Sets the `Database` that this model instance belongs to. +Sets the `Database` that this model instance belongs to. This is done automatically when the instance is read from the database or written to it. @@ -261,7 +261,7 @@ If omitted, it is assumed to be the names of all fields in the model, in order o #### get_database() -Gets the `Database` that this model instance belongs to. +Gets the `Database` that this model instance belongs to. Returns `None` unless the instance was read from the database or written to it. @@ -280,7 +280,7 @@ Returns a `QuerySet` for selecting instances of this model class. #### set_database(db) -Sets the `Database` that this model instance belongs to. +Sets the `Database` that this model instance belongs to. This is done automatically when the instance is read from the database or written to it. @@ -585,6 +585,13 @@ Returns the contents of the query's `WHERE` clause as a string. Returns the number of matching model instances. +#### distinct() + + +Adds a DISTINCT clause to the query, meaning that any duplicate rows +in the results will be omitted. + + #### exclude(**kwargs) @@ -678,6 +685,13 @@ Returns the contents of the query's `WHERE` clause as a string. Returns the number of rows after aggregation. +#### distinct() + + +Adds a DISTINCT clause to the query, meaning that any duplicate rows +in the results will be omitted. + + #### exclude(**kwargs) diff --git a/docs/querysets.md b/docs/querysets.md index 2bbefd9..d27c836 100644 --- a/docs/querysets.md +++ b/docs/querysets.md @@ -99,6 +99,16 @@ When some of the model fields aren't needed, it is more efficient to omit them f qs = Person.objects_in(database).only('first_name', 'birthday') +Distinct +-------- + +Adds a DISTINCT clause to the query, meaning that any duplicate rows in the results will be omitted. + + >>> Person.objects_in(database).only('first_name').count() + 100 + >>> Person.objects_in(database).only('first_name').distinct().count() + 94 + Slicing ------- diff --git a/docs/toc.md b/docs/toc.md index aa5bb3b..99c486d 100644 --- a/docs/toc.md +++ b/docs/toc.md @@ -20,6 +20,7 @@ * [Counting and Checking Existence](querysets.md#counting-and-checking-existence) * [Ordering](querysets.md#ordering) * [Omitting Fields](querysets.md#omitting-fields) + * [Distinct](querysets.md#distinct) * [Slicing](querysets.md#slicing) * [Pagination](querysets.md#pagination) * [Aggregation](querysets.md#aggregation) diff --git a/scripts/generate_ref.py b/scripts/generate_ref.py index c35e881..d2863fd 100644 --- a/scripts/generate_ref.py +++ b/scripts/generate_ref.py @@ -51,7 +51,10 @@ def get_method_sig(method): for arg in argspec.args: default_arg = _get_default_arg(argspec.args, argspec.defaults, arg_index) if default_arg.has_default: - args.append("%s=%s" % (arg, default_arg.default_value)) + val = default_arg.default_value + if isinstance(val, basestring): + val = '"' + val + '"' + args.append("%s=%s" % (arg, val)) else: args.append(arg) arg_index += 1 diff --git a/src/infi/clickhouse_orm/query.py b/src/infi/clickhouse_orm/query.py index c1c1dd0..0bf764a 100644 --- a/src/infi/clickhouse_orm/query.py +++ b/src/infi/clickhouse_orm/query.py @@ -187,6 +187,7 @@ class QuerySet(object): self._q = [] self._fields = [] self._limits = None + self._distinct = False def __iter__(self): """ @@ -228,14 +229,15 @@ class QuerySet(object): """ Returns the whole query as a SQL string. """ + distinct = 'DISTINCT ' if self._distinct else '' fields = '*' if self._fields: fields = comma_join('`%s`' % field for field in self._fields) ordering = '\nORDER BY ' + self.order_by_as_sql() if self._order_by else '' limit = '\nLIMIT %d, %d' % self._limits if self._limits else '' - params = (fields, self._model_cls.table_name(), + params = (distinct, fields, self._model_cls.table_name(), self.conditions_as_sql(), ordering, limit) - return u'SELECT %s\nFROM `%s`\nWHERE %s%s%s' % params + return u'SELECT %s%s\nFROM `%s`\nWHERE %s%s%s' % params def order_by_as_sql(self): """ @@ -259,6 +261,11 @@ class QuerySet(object): """ Returns the number of matching model instances. """ + if self._distinct: + # Use a subquery, since a simple count won't be accurate + sql = u'SELECT count() FROM (%s)' % self.as_sql() + raw = self._database.raw(sql) + return int(raw) if raw else 0 return self._database.count(self._model_cls, self.conditions_as_sql()) def order_by(self, *field_names): @@ -296,7 +303,7 @@ class QuerySet(object): return qs def paginate(self, page_num=1, page_size=100): - ''' + """ Returns a single page of model instances that match the queryset. Note that `order_by` should be used first, to ensure a correct partitioning of records into pages. @@ -306,7 +313,7 @@ class QuerySet(object): The result is a namedtuple containing `objects` (list), `number_of_objects`, `pages_total`, `number` (of the current page), and `page_size`. - ''' + """ from .database import Page count = self.count() pages_total = int(ceil(count / float(page_size))) @@ -323,8 +330,17 @@ class QuerySet(object): page_size=page_size ) + def distinct(self): + """ + Adds a DISTINCT clause to the query, meaning that any duplicate rows + in the results will be omitted. + """ + qs = copy(self) + qs._distinct = True + return qs + def aggregate(self, *args, **kwargs): - ''' + """ Returns an `AggregateQuerySet` over this query, with `args` serving as grouping fields and `kwargs` serving as calculated fields. At least one calculated field is required. For example: @@ -337,7 +353,7 @@ class QuerySet(object): WHERE data > '2017-08-01' GROUP BY event_type ``` - ''' + """ return AggregateQuerySet(self, args, kwargs) @@ -368,6 +384,7 @@ class AggregateQuerySet(QuerySet): self._order_by = list(base_qs._order_by) self._q = list(base_qs._q) self._limits = base_qs._limits + self._distinct = base_qs._distinct def group_by(self, *args): """ @@ -398,15 +415,17 @@ class AggregateQuerySet(QuerySet): """ Returns the whole query as a SQL string. """ + distinct = 'DISTINCT ' if self._distinct else '' grouping = comma_join('`%s`' % field for field in self._grouping_fields) fields = comma_join(list(self._fields) + ['%s AS %s' % (v, k) for k, v in self._calculated_fields.items()]) params = dict( + distinct=distinct, grouping=grouping or "''", fields=fields, table=self._model_cls.table_name(), conds=self.conditions_as_sql() ) - sql = u'SELECT %(fields)s\nFROM `%(table)s`\nWHERE %(conds)s\nGROUP BY %(grouping)s' % params + sql = u'SELECT %(distinct)s%(fields)s\nFROM `%(table)s`\nWHERE %(conds)s\nGROUP BY %(grouping)s' % params if self._order_by: sql += '\nORDER BY ' + self.order_by_as_sql() if self._limits: diff --git a/tests/test_querysets.py b/tests/test_querysets.py index ad834bb..cbbc65d 100644 --- a/tests/test_querysets.py +++ b/tests/test_querysets.py @@ -21,8 +21,11 @@ class QuerySetTestCase(TestCaseWithData): def _test_qs(self, qs, expected_count): logging.info(qs.as_sql()) + count = 0 for instance in qs: - logging.info('\t%s' % instance.to_dict()) + count += 1 + logging.info('\t[%d]\t%s' % (count, instance.to_dict())) + self.assertEquals(count, expected_count) self.assertEquals(qs.count(), expected_count) def test_no_filtering(self): @@ -202,6 +205,11 @@ class QuerySetTestCase(TestCaseWithData): page = qs.paginate(1, 100) self.assertEquals(page.number_of_objects, 10) + def test_distinct(self): + qs = Person.objects_in(self.database).distinct() + self._test_qs(qs, 100) + self._test_qs(qs.only('first_name'), 94) + class AggregateTestCase(TestCaseWithData): @@ -310,6 +318,12 @@ class AggregateTestCase(TestCaseWithData): qs = qs.filter(weekday=1) self.assertEquals(qs.count(), 1) + def test_aggregate_with_distinct(self): + # In this case distinct has no effect + qs = Person.objects_in(self.database).aggregate(average_height='avg(height)').distinct() + print(qs.as_sql()) + self.assertEquals(qs.count(), 1) + Color = Enum('Color', u'red blue green yellow brown white black') From 430872b9589452c65185d0e61e09190e19a60b2d Mon Sep 17 00:00:00 2001 From: M1ha Date: Mon, 11 Sep 2017 10:17:06 +0500 Subject: [PATCH 06/10] Added readonly parameter to all Field subclasses --- src/infi/clickhouse_orm/fields.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index 0cf2543..a114db3 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -93,10 +93,10 @@ class StringField(Field): class FixedStringField(StringField): - def __init__(self, length, default=None, alias=None, materialized=None): + def __init__(self, length, default=None, alias=None, materialized=None, readonly=None): self._length = length self.db_type = 'FixedString(%d)' % length - super(FixedStringField, self).__init__(default, alias, materialized) + super(FixedStringField, self).__init__(default, alias, materialized,readonly) def to_python(self, value, timezone_in_use): value = super(FixedStringField, self).to_python(value, timezone_in_use) @@ -272,11 +272,11 @@ class BaseEnumField(Field): Abstract base class for all enum-type fields. ''' - def __init__(self, enum_cls, default=None, alias=None, materialized=None): + def __init__(self, enum_cls, default=None, alias=None, materialized=None, readonly=None): self.enum_cls = enum_cls if default is None: default = list(enum_cls)[0] - super(BaseEnumField, self).__init__(default, alias, materialized) + super(BaseEnumField, self).__init__(default, alias, materialized, readonly) def to_python(self, value, timezone_in_use): if isinstance(value, self.enum_cls): @@ -336,9 +336,9 @@ class ArrayField(Field): class_default = [] - def __init__(self, inner_field, default=None, alias=None, materialized=None): + def __init__(self, inner_field, default=None, alias=None, materialized=None, readonly=None): self.inner_field = inner_field - super(ArrayField, self).__init__(default, alias, materialized) + super(ArrayField, self).__init__(default, alias, materialized, readonly) def to_python(self, value, timezone_in_use): if isinstance(value, text_type): @@ -372,7 +372,7 @@ class NullableField(Field): self._null_values = [None] if extra_null_values: self._null_values.extend(extra_null_values) - super(NullableField, self).__init__(default, alias, materialized) + super(NullableField, self).__init__(default, alias, materialized, readonly=None) def to_python(self, value, timezone_in_use): if value == '\\N' or value is None: From 7bbcae574a12c6e316a904ff44f3b4a73b3c8c34 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sun, 10 Sep 2017 15:46:55 +0300 Subject: [PATCH 07/10] Add `AlterTableWithBuffer` migration operation --- CHANGELOG.md | 4 ++ docs/schema_migrations.md | 13 ++++++- src/infi/clickhouse_orm/migrations.py | 21 +++++++++- tests/sample_migrations/0010.py | 6 +++ tests/sample_migrations/0011.py | 6 +++ tests/test_migrations.py | 56 ++++++++++++++++++++++++++- 6 files changed, 102 insertions(+), 4 deletions(-) create mode 100644 tests/sample_migrations/0010.py create mode 100644 tests/sample_migrations/0011.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d302dc4..1bfa0fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,10 @@ Change Log ========== +Unreleased +---------- +- Add `AlterTableWithBuffer` migration operation + v0.9.6 ------ - Fix python3 compatibility (TvoroG) diff --git a/docs/schema_migrations.md b/docs/schema_migrations.md index e4647bf..ce56b04 100644 --- a/docs/schema_migrations.md +++ b/docs/schema_migrations.md @@ -34,11 +34,13 @@ The following operations are supported: **CreateTable** -A migration operation that creates a table for a given model class. +A migration operation that creates a table for a given model class. If the table already exists, the operation does nothing. + +In case the model class is a `BufferModel`, the operation first creates the underlying on-disk table, and then creates the buffer table. **DropTable** -A migration operation that drops the table of a given model class. +A migration operation that drops the table of a given model class. If the table does not exist, the operation does nothing. **AlterTable** @@ -50,6 +52,13 @@ A migration operation that compares the table of a given model class to the mode Default values are not altered by this operation. +**AlterTableWithBuffer** + +A compound migration operation for altering a buffer table and its underlying on-disk table. The buffer table is dropped, the on-disk table is altered, and then the buffer table is re-created. This is the procedure recommended in the ClickHouse documentation for handling scenarios in which the underlying table needs to be modified. + +Applying this migration operation to a regular table has the same effect as an `AlterTable` operation. + + Running Migrations ------------------ diff --git a/src/infi/clickhouse_orm/migrations.py b/src/infi/clickhouse_orm/migrations.py index 1a82a5b..a7843a7 100644 --- a/src/infi/clickhouse_orm/migrations.py +++ b/src/infi/clickhouse_orm/migrations.py @@ -82,6 +82,25 @@ class AlterTable(Operation): self._alter_table(database, 'MODIFY COLUMN %s %s' % model_field) +class AlterTableWithBuffer(Operation): + ''' + A migration operation for altering a buffer table and its underlying on-disk table. + The buffer table is dropped, the on-disk table is altered, and then the buffer table + is re-created. + ''' + + def __init__(self, model_class): + self.model_class = model_class + + def apply(self, database): + if issubclass(self.model_class, BufferModel): + DropTable(self.model_class).apply(database) + AlterTable(self.model_class.engine.main_model).apply(database) + CreateTable(self.model_class).apply(database) + else: + AlterTable(self.model_class).apply(database) + + class DropTable(Operation): ''' A migration operation that drops the table of a given model class. @@ -91,7 +110,7 @@ class DropTable(Operation): self.model_class = model_class def apply(self, database): - logger.info(' Drop table %s', self.model_class.__name__) + logger.info(' Drop table %s', self.model_class.table_name()) database.drop_table(self.model_class) diff --git a/tests/sample_migrations/0010.py b/tests/sample_migrations/0010.py new file mode 100644 index 0000000..3892583 --- /dev/null +++ b/tests/sample_migrations/0010.py @@ -0,0 +1,6 @@ +from infi.clickhouse_orm import migrations +from ..test_migrations import * + +operations = [ + migrations.CreateTable(Model4Buffer) +] diff --git a/tests/sample_migrations/0011.py b/tests/sample_migrations/0011.py new file mode 100644 index 0000000..dd9d09e --- /dev/null +++ b/tests/sample_migrations/0011.py @@ -0,0 +1,6 @@ +from infi.clickhouse_orm import migrations +from ..test_migrations import * + +operations = [ + migrations.AlterTableWithBuffer(Model4Buffer_changed) +] diff --git a/tests/test_migrations.py b/tests/test_migrations.py index 3478f9f..7e31c84 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import unittest from infi.clickhouse_orm.database import Database -from infi.clickhouse_orm.models import Model +from infi.clickhouse_orm.models import Model, BufferModel from infi.clickhouse_orm.fields import * from infi.clickhouse_orm.engines import * from infi.clickhouse_orm.migrations import MigrationHistory @@ -61,6 +61,7 @@ class MigrationsTestCase(unittest.TestCase): self.assertTrue(self.tableExists(EnumModel1)) self.assertEquals(self.getTableFields(EnumModel2), [('date', 'Date'), ('f1', "Enum16('dog' = 1, 'cat' = 2, 'horse' = 3, 'pig' = 4)")]) + # Materialized fields and alias fields self.database.migrate('tests.sample_migrations', 8) self.assertTrue(self.tableExists(MaterializedModel)) self.assertEquals(self.getTableFields(MaterializedModel), @@ -69,6 +70,15 @@ class MigrationsTestCase(unittest.TestCase): self.assertTrue(self.tableExists(AliasModel)) self.assertEquals(self.getTableFields(AliasModel), [('date', 'Date'), ('date_alias', "Date")]) + # Buffer models creation and alteration + self.database.migrate('tests.sample_migrations', 10) + self.assertTrue(self.tableExists(Model4)) + self.assertTrue(self.tableExists(Model4Buffer)) + self.assertEquals(self.getTableFields(Model4), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')]) + self.assertEquals(self.getTableFields(Model4Buffer), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')]) + self.database.migrate('tests.sample_migrations', 11) + self.assertEquals(self.getTableFields(Model4), [('date', 'Date'), ('f3', 'DateTime'), ('f2', 'String')]) + self.assertEquals(self.getTableFields(Model4Buffer), [('date', 'Date'), ('f3', 'DateTime'), ('f2', 'String')]) # Several different models with the same table name, to simulate a table that changes over time @@ -159,3 +169,47 @@ class AliasModel(Model): @classmethod def table_name(cls): return 'alias_date' + + +class Model4(Model): + + date = DateField() + f1 = Int32Field() + f2 = StringField() + + engine = MergeTree('date', ('date',)) + + @classmethod + def table_name(cls): + return 'model4' + + +class Model4Buffer(BufferModel, Model4): + + engine = Buffer(Model4) + + @classmethod + def table_name(cls): + return 'model4buffer' + + +class Model4_changed(Model): + + date = DateField() + f3 = DateTimeField() + f2 = StringField() + + engine = MergeTree('date', ('date',)) + + @classmethod + def table_name(cls): + return 'model4' + + +class Model4Buffer_changed(BufferModel, Model4_changed): + + engine = Buffer(Model4_changed) + + @classmethod + def table_name(cls): + return 'model4buffer' From 59564f8c70f614fba2ba900eb0ada8a784db8248 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sun, 10 Sep 2017 17:17:04 +0300 Subject: [PATCH 08/10] Add `distinct` method to querysets --- CHANGELOG.md | 1 + docs/class_reference.md | 30 +++++++++++++++++++++-------- docs/querysets.md | 10 ++++++++++ docs/toc.md | 1 + scripts/generate_ref.py | 5 ++++- src/infi/clickhouse_orm/query.py | 33 +++++++++++++++++++++++++------- tests/test_querysets.py | 16 +++++++++++++++- 7 files changed, 79 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bfa0fc..8143834 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ Change Log Unreleased ---------- +- Add `distinct` method to querysets - Add `AlterTableWithBuffer` migration operation v0.9.6 diff --git a/docs/class_reference.md b/docs/class_reference.md index 7e4bc74..f5ef191 100644 --- a/docs/class_reference.md +++ b/docs/class_reference.md @@ -7,7 +7,7 @@ infi.clickhouse_orm.database ### Database -Database instances connect to a specific ClickHouse database for running queries, +Database instances connect to a specific ClickHouse database for running queries, inserting data and other operations. #### Database(db_name, db_url="http://localhost:8123/", username=None, password=None, readonly=False, autocreate=True) @@ -71,7 +71,7 @@ Insert records into the database. Executes schema migrations. -- `migrations_package_name` - fully qualified name of the Python package +- `migrations_package_name` - fully qualified name of the Python package containing the migrations. - `up_to` - number of the last migration to apply. @@ -89,7 +89,7 @@ Selects records and returns a single page of model instances. - `conditions`: optional SQL conditions (contents of the WHERE clause). - `settings`: query settings to send as HTTP GET parameters -The result is a namedtuple containing `objects` (list), `number_of_objects`, +The result is a namedtuple containing `objects` (list), `number_of_objects`, `pages_total`, `number` (of the current page), and `page_size`. @@ -128,7 +128,7 @@ infi.clickhouse_orm.models A base class for ORM models. Each model class represent a ClickHouse table. For example: - + class CPUStats(Model): timestamp = DateTimeField() cpu_id = UInt16Field() @@ -172,7 +172,7 @@ If omitted, it is assumed to be the names of all fields in the model, in order o #### get_database() -Gets the `Database` that this model instance belongs to. +Gets the `Database` that this model instance belongs to. Returns `None` unless the instance was read from the database or written to it. @@ -191,7 +191,7 @@ Returns a `QuerySet` for selecting instances of this model class. #### set_database(db) -Sets the `Database` that this model instance belongs to. +Sets the `Database` that this model instance belongs to. This is done automatically when the instance is read from the database or written to it. @@ -261,7 +261,7 @@ If omitted, it is assumed to be the names of all fields in the model, in order o #### get_database() -Gets the `Database` that this model instance belongs to. +Gets the `Database` that this model instance belongs to. Returns `None` unless the instance was read from the database or written to it. @@ -280,7 +280,7 @@ Returns a `QuerySet` for selecting instances of this model class. #### set_database(db) -Sets the `Database` that this model instance belongs to. +Sets the `Database` that this model instance belongs to. This is done automatically when the instance is read from the database or written to it. @@ -585,6 +585,13 @@ Returns the contents of the query's `WHERE` clause as a string. Returns the number of matching model instances. +#### distinct() + + +Adds a DISTINCT clause to the query, meaning that any duplicate rows +in the results will be omitted. + + #### exclude(**kwargs) @@ -678,6 +685,13 @@ Returns the contents of the query's `WHERE` clause as a string. Returns the number of rows after aggregation. +#### distinct() + + +Adds a DISTINCT clause to the query, meaning that any duplicate rows +in the results will be omitted. + + #### exclude(**kwargs) diff --git a/docs/querysets.md b/docs/querysets.md index 2bbefd9..d27c836 100644 --- a/docs/querysets.md +++ b/docs/querysets.md @@ -99,6 +99,16 @@ When some of the model fields aren't needed, it is more efficient to omit them f qs = Person.objects_in(database).only('first_name', 'birthday') +Distinct +-------- + +Adds a DISTINCT clause to the query, meaning that any duplicate rows in the results will be omitted. + + >>> Person.objects_in(database).only('first_name').count() + 100 + >>> Person.objects_in(database).only('first_name').distinct().count() + 94 + Slicing ------- diff --git a/docs/toc.md b/docs/toc.md index 0f83389..f994141 100644 --- a/docs/toc.md +++ b/docs/toc.md @@ -20,6 +20,7 @@ * [Counting and Checking Existence](querysets.md#counting-and-checking-existence) * [Ordering](querysets.md#ordering) * [Omitting Fields](querysets.md#omitting-fields) + * [Distinct](querysets.md#distinct) * [Slicing](querysets.md#slicing) * [Pagination](querysets.md#pagination) * [Aggregation](querysets.md#aggregation) diff --git a/scripts/generate_ref.py b/scripts/generate_ref.py index c35e881..d2863fd 100644 --- a/scripts/generate_ref.py +++ b/scripts/generate_ref.py @@ -51,7 +51,10 @@ def get_method_sig(method): for arg in argspec.args: default_arg = _get_default_arg(argspec.args, argspec.defaults, arg_index) if default_arg.has_default: - args.append("%s=%s" % (arg, default_arg.default_value)) + val = default_arg.default_value + if isinstance(val, basestring): + val = '"' + val + '"' + args.append("%s=%s" % (arg, val)) else: args.append(arg) arg_index += 1 diff --git a/src/infi/clickhouse_orm/query.py b/src/infi/clickhouse_orm/query.py index c1c1dd0..0bf764a 100644 --- a/src/infi/clickhouse_orm/query.py +++ b/src/infi/clickhouse_orm/query.py @@ -187,6 +187,7 @@ class QuerySet(object): self._q = [] self._fields = [] self._limits = None + self._distinct = False def __iter__(self): """ @@ -228,14 +229,15 @@ class QuerySet(object): """ Returns the whole query as a SQL string. """ + distinct = 'DISTINCT ' if self._distinct else '' fields = '*' if self._fields: fields = comma_join('`%s`' % field for field in self._fields) ordering = '\nORDER BY ' + self.order_by_as_sql() if self._order_by else '' limit = '\nLIMIT %d, %d' % self._limits if self._limits else '' - params = (fields, self._model_cls.table_name(), + params = (distinct, fields, self._model_cls.table_name(), self.conditions_as_sql(), ordering, limit) - return u'SELECT %s\nFROM `%s`\nWHERE %s%s%s' % params + return u'SELECT %s%s\nFROM `%s`\nWHERE %s%s%s' % params def order_by_as_sql(self): """ @@ -259,6 +261,11 @@ class QuerySet(object): """ Returns the number of matching model instances. """ + if self._distinct: + # Use a subquery, since a simple count won't be accurate + sql = u'SELECT count() FROM (%s)' % self.as_sql() + raw = self._database.raw(sql) + return int(raw) if raw else 0 return self._database.count(self._model_cls, self.conditions_as_sql()) def order_by(self, *field_names): @@ -296,7 +303,7 @@ class QuerySet(object): return qs def paginate(self, page_num=1, page_size=100): - ''' + """ Returns a single page of model instances that match the queryset. Note that `order_by` should be used first, to ensure a correct partitioning of records into pages. @@ -306,7 +313,7 @@ class QuerySet(object): The result is a namedtuple containing `objects` (list), `number_of_objects`, `pages_total`, `number` (of the current page), and `page_size`. - ''' + """ from .database import Page count = self.count() pages_total = int(ceil(count / float(page_size))) @@ -323,8 +330,17 @@ class QuerySet(object): page_size=page_size ) + def distinct(self): + """ + Adds a DISTINCT clause to the query, meaning that any duplicate rows + in the results will be omitted. + """ + qs = copy(self) + qs._distinct = True + return qs + def aggregate(self, *args, **kwargs): - ''' + """ Returns an `AggregateQuerySet` over this query, with `args` serving as grouping fields and `kwargs` serving as calculated fields. At least one calculated field is required. For example: @@ -337,7 +353,7 @@ class QuerySet(object): WHERE data > '2017-08-01' GROUP BY event_type ``` - ''' + """ return AggregateQuerySet(self, args, kwargs) @@ -368,6 +384,7 @@ class AggregateQuerySet(QuerySet): self._order_by = list(base_qs._order_by) self._q = list(base_qs._q) self._limits = base_qs._limits + self._distinct = base_qs._distinct def group_by(self, *args): """ @@ -398,15 +415,17 @@ class AggregateQuerySet(QuerySet): """ Returns the whole query as a SQL string. """ + distinct = 'DISTINCT ' if self._distinct else '' grouping = comma_join('`%s`' % field for field in self._grouping_fields) fields = comma_join(list(self._fields) + ['%s AS %s' % (v, k) for k, v in self._calculated_fields.items()]) params = dict( + distinct=distinct, grouping=grouping or "''", fields=fields, table=self._model_cls.table_name(), conds=self.conditions_as_sql() ) - sql = u'SELECT %(fields)s\nFROM `%(table)s`\nWHERE %(conds)s\nGROUP BY %(grouping)s' % params + sql = u'SELECT %(distinct)s%(fields)s\nFROM `%(table)s`\nWHERE %(conds)s\nGROUP BY %(grouping)s' % params if self._order_by: sql += '\nORDER BY ' + self.order_by_as_sql() if self._limits: diff --git a/tests/test_querysets.py b/tests/test_querysets.py index ad834bb..cbbc65d 100644 --- a/tests/test_querysets.py +++ b/tests/test_querysets.py @@ -21,8 +21,11 @@ class QuerySetTestCase(TestCaseWithData): def _test_qs(self, qs, expected_count): logging.info(qs.as_sql()) + count = 0 for instance in qs: - logging.info('\t%s' % instance.to_dict()) + count += 1 + logging.info('\t[%d]\t%s' % (count, instance.to_dict())) + self.assertEquals(count, expected_count) self.assertEquals(qs.count(), expected_count) def test_no_filtering(self): @@ -202,6 +205,11 @@ class QuerySetTestCase(TestCaseWithData): page = qs.paginate(1, 100) self.assertEquals(page.number_of_objects, 10) + def test_distinct(self): + qs = Person.objects_in(self.database).distinct() + self._test_qs(qs, 100) + self._test_qs(qs.only('first_name'), 94) + class AggregateTestCase(TestCaseWithData): @@ -310,6 +318,12 @@ class AggregateTestCase(TestCaseWithData): qs = qs.filter(weekday=1) self.assertEquals(qs.count(), 1) + def test_aggregate_with_distinct(self): + # In this case distinct has no effect + qs = Person.objects_in(self.database).aggregate(average_height='avg(height)').distinct() + print(qs.as_sql()) + self.assertEquals(qs.count(), 1) + Color = Enum('Color', u'red blue green yellow brown white black') From 8304ddca5ca3e44f9243bf9d332e12d195b183b1 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Wed, 13 Sep 2017 12:15:44 +0300 Subject: [PATCH 09/10] Update docs --- docs/class_reference.md | 55 +++++++++++++++++++------------ docs/toc.md | 1 + src/infi/clickhouse_orm/fields.py | 2 +- 3 files changed, 36 insertions(+), 22 deletions(-) diff --git a/docs/class_reference.md b/docs/class_reference.md index f5ef191..e56fc7a 100644 --- a/docs/class_reference.md +++ b/docs/class_reference.md @@ -317,28 +317,28 @@ infi.clickhouse_orm.fields Abstract base class for all field types. -#### Field(default=None, alias=None, materialized=None) +#### Field(default=None, alias=None, materialized=None, readonly=None) ### StringField Extends Field -#### StringField(default=None, alias=None, materialized=None) +#### StringField(default=None, alias=None, materialized=None, readonly=None) ### DateField Extends Field -#### DateField(default=None, alias=None, materialized=None) +#### DateField(default=None, alias=None, materialized=None, readonly=None) ### DateTimeField Extends Field -#### DateTimeField(default=None, alias=None, materialized=None) +#### DateTimeField(default=None, alias=None, materialized=None, readonly=None) ### BaseIntField @@ -348,7 +348,7 @@ Extends Field Abstract base class for all integer-type fields. -#### BaseIntField(default=None, alias=None, materialized=None) +#### BaseIntField(default=None, alias=None, materialized=None, readonly=None) ### BaseFloatField @@ -358,7 +358,7 @@ Extends Field Abstract base class for all float-type fields. -#### BaseFloatField(default=None, alias=None, materialized=None) +#### BaseFloatField(default=None, alias=None, materialized=None, readonly=None) ### BaseEnumField @@ -368,14 +368,14 @@ Extends Field Abstract base class for all enum-type fields. -#### BaseEnumField(enum_cls, default=None, alias=None, materialized=None) +#### BaseEnumField(enum_cls, default=None, alias=None, materialized=None, readonly=None) ### ArrayField Extends Field -#### ArrayField(inner_field, default=None, alias=None, materialized=None) +#### ArrayField(inner_field, default=None, alias=None, materialized=None, readonly=None) ### NullableField @@ -389,91 +389,91 @@ Extends Field Extends StringField -#### FixedStringField(length, default=None, alias=None, materialized=None) +#### FixedStringField(length, default=None, alias=None, materialized=None, readonly=None) ### UInt8Field Extends BaseIntField -#### UInt8Field(default=None, alias=None, materialized=None) +#### UInt8Field(default=None, alias=None, materialized=None, readonly=None) ### UInt16Field Extends BaseIntField -#### UInt16Field(default=None, alias=None, materialized=None) +#### UInt16Field(default=None, alias=None, materialized=None, readonly=None) ### UInt32Field Extends BaseIntField -#### UInt32Field(default=None, alias=None, materialized=None) +#### UInt32Field(default=None, alias=None, materialized=None, readonly=None) ### UInt64Field Extends BaseIntField -#### UInt64Field(default=None, alias=None, materialized=None) +#### UInt64Field(default=None, alias=None, materialized=None, readonly=None) ### Int8Field Extends BaseIntField -#### Int8Field(default=None, alias=None, materialized=None) +#### Int8Field(default=None, alias=None, materialized=None, readonly=None) ### Int16Field Extends BaseIntField -#### Int16Field(default=None, alias=None, materialized=None) +#### Int16Field(default=None, alias=None, materialized=None, readonly=None) ### Int32Field Extends BaseIntField -#### Int32Field(default=None, alias=None, materialized=None) +#### Int32Field(default=None, alias=None, materialized=None, readonly=None) ### Int64Field Extends BaseIntField -#### Int64Field(default=None, alias=None, materialized=None) +#### Int64Field(default=None, alias=None, materialized=None, readonly=None) ### Float32Field Extends BaseFloatField -#### Float32Field(default=None, alias=None, materialized=None) +#### Float32Field(default=None, alias=None, materialized=None, readonly=None) ### Float64Field Extends BaseFloatField -#### Float64Field(default=None, alias=None, materialized=None) +#### Float64Field(default=None, alias=None, materialized=None, readonly=None) ### Enum8Field Extends BaseEnumField -#### Enum8Field(enum_cls, default=None, alias=None, materialized=None) +#### Enum8Field(enum_cls, default=None, alias=None, materialized=None, readonly=None) ### Enum16Field Extends BaseEnumField -#### Enum16Field(enum_cls, default=None, alias=None, materialized=None) +#### Enum16Field(enum_cls, default=None, alias=None, materialized=None, readonly=None) infi.clickhouse_orm.engines @@ -512,6 +512,19 @@ Read more [here](https://clickhouse.yandex/reference_en.html#Buffer). #### Buffer(main_model, num_layers=16, min_time=10, max_time=100, min_rows=10000, max_rows=1000000, min_bytes=10000000, max_bytes=100000000) +### Merge + +Extends Engine + + +The Merge engine (not to be confused with MergeTree) does not store data itself, +but allows reading from any number of other tables simultaneously. +Writing to a table is not supported +https://clickhouse.yandex/docs/en/single/index.html#document-table_engines/merge + +#### Merge(table_regex) + + ### CollapsingMergeTree Extends MergeTree diff --git a/docs/toc.md b/docs/toc.md index f994141..cc1c8a6 100644 --- a/docs/toc.md +++ b/docs/toc.md @@ -87,6 +87,7 @@ * [Memory](class_reference.md#memory) * [MergeTree](class_reference.md#mergetree) * [Buffer](class_reference.md#buffer) + * [Merge](class_reference.md#merge) * [CollapsingMergeTree](class_reference.md#collapsingmergetree) * [SummingMergeTree](class_reference.md#summingmergetree) * [ReplacingMergeTree](class_reference.md#replacingmergetree) diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index a114db3..3e3207a 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -96,7 +96,7 @@ class FixedStringField(StringField): def __init__(self, length, default=None, alias=None, materialized=None, readonly=None): self._length = length self.db_type = 'FixedString(%d)' % length - super(FixedStringField, self).__init__(default, alias, materialized,readonly) + super(FixedStringField, self).__init__(default, alias, materialized, readonly) def to_python(self, value, timezone_in_use): value = super(FixedStringField, self).to_python(value, timezone_in_use) From 77fe6704d8bb0dbf4a03295778131d174ceee8cf Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Wed, 13 Sep 2017 12:24:51 +0300 Subject: [PATCH 10/10] Releasing v0.9.7 --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8143834..a32d9d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,11 @@ Change Log ========== -Unreleased ----------- +v0.9.7 +------ - Add `distinct` method to querysets - Add `AlterTableWithBuffer` migration operation +- Support Merge engine (M1hacka) v0.9.6 ------