From 1b37c38bbf39c5dcb26ea194b89b180e1170b6fd Mon Sep 17 00:00:00 2001 From: M1ha Date: Mon, 15 Jul 2019 13:01:45 +0500 Subject: [PATCH 1/8] Added primary_key parameter --- docs/class_reference.md | 8 ++++---- docs/table_engines.md | 13 +++++++++++++ src/infi/clickhouse_orm/engines.py | 23 ++++++++++++++++------- tests/test_engines.py | 23 +++++++++++++++++++++++ 4 files changed, 56 insertions(+), 11 deletions(-) diff --git a/docs/class_reference.md b/docs/class_reference.md index bcace93..172f75f 100644 --- a/docs/class_reference.md +++ b/docs/class_reference.md @@ -737,7 +737,7 @@ Extends Engine Extends Engine -#### MergeTree(date_col=None, order_by=(), sampling_expr=None, index_granularity=8192, replica_table_path=None, replica_name=None, partition_key=None) +#### MergeTree(date_col=None, order_by=(), sampling_expr=None, index_granularity=8192, replica_table_path=None, replica_name=None, partition_key=None, primary_key=None) ### Buffer @@ -793,21 +793,21 @@ straightly into Distributed table, optional Extends MergeTree -#### CollapsingMergeTree(date_col=None, order_by=(), sign_col="sign", sampling_expr=None, index_granularity=8192, replica_table_path=None, replica_name=None, partition_key=None) +#### CollapsingMergeTree(date_col=None, order_by=(), sign_col="sign", sampling_expr=None, index_granularity=8192, replica_table_path=None, replica_name=None, partition_key=None, primary_key=None) ### SummingMergeTree Extends MergeTree -#### SummingMergeTree(date_col=None, order_by=(), summing_cols=None, sampling_expr=None, index_granularity=8192, replica_table_path=None, replica_name=None, partition_key=None) +#### SummingMergeTree(date_col=None, order_by=(), summing_cols=None, sampling_expr=None, index_granularity=8192, replica_table_path=None, replica_name=None, partition_key=None, primary_key=None) ### ReplacingMergeTree Extends MergeTree -#### ReplacingMergeTree(date_col=None, order_by=(), ver_col=None, sampling_expr=None, index_granularity=8192, replica_table_path=None, replica_name=None, partition_key=None) +#### ReplacingMergeTree(date_col=None, order_by=(), ver_col=None, sampling_expr=None, index_granularity=8192, replica_table_path=None, replica_name=None, partition_key=None, primary_key=None) infi.clickhouse_orm.query diff --git a/docs/table_engines.md b/docs/table_engines.md index d4ba905..b71e383 100644 --- a/docs/table_engines.md +++ b/docs/table_engines.md @@ -73,6 +73,19 @@ Example: partition_key=('toYYYYMM(EventDate)', 'BannerID')) +### Primary key +ClickHouse supports [custom primary key](https://clickhouse.yandex/docs/en/operations/table_engines/mergetree/#primary-keys-and-indexes-in-queries/) expressions since version 1.1.54310 + +You can use custom primary key with any `MergeTree` family engine. +To set custom partitioning add `primary_key` parameter. It should be a tuple of expressions, by which partitions are built. + +By default primary key is equal to order_by expression + +Example: + + engine = engines.ReplacingMergeTree(order_by=('OrderID', 'EventDate', 'BannerID'), ver_col='Version', + partition_key=('toYYYYMM(EventDate)', 'BannerID'), primary_key=('OrderID',)) + ### Data Replication Any of the above engines can be converted to a replicated engine (e.g. `ReplicatedMergeTree`) by adding two parameters, `replica_table_path` and `replica_name`: diff --git a/src/infi/clickhouse_orm/engines.py b/src/infi/clickhouse_orm/engines.py index e38bfcf..3fb8ed6 100644 --- a/src/infi/clickhouse_orm/engines.py +++ b/src/infi/clickhouse_orm/engines.py @@ -35,8 +35,10 @@ class Memory(Engine): class MergeTree(Engine): def __init__(self, date_col=None, order_by=(), sampling_expr=None, - index_granularity=8192, replica_table_path=None, replica_name=None, partition_key=None): + index_granularity=8192, replica_table_path=None, replica_name=None, partition_key=None, + primary_key=None): assert type(order_by) in (list, tuple), 'order_by must be a list or tuple' + assert primary_key is None or type(primary_key) in (list, tuple), 'primary_key must be a list or tuple' assert date_col is None or isinstance(date_col, six.string_types), 'date_col must be string if present' assert partition_key is None or type(partition_key) in (list, tuple),\ 'partition_key must be tuple or list if present' @@ -48,6 +50,7 @@ class MergeTree(Engine): assert date_col or partition_key, "You must set either date_col or partition_key" self.date_col = date_col self.partition_key = partition_key if partition_key else ('toYYYYMM(`%s`)' % date_col,) + self.primary_key = primary_key self.order_by = order_by self.sampling_expr = sampling_expr @@ -78,6 +81,9 @@ class MergeTree(Engine): partition_sql = "PARTITION BY %s ORDER BY %s" \ % ('(%s)' % comma_join(self.partition_key), '(%s)' % comma_join(self.order_by)) + if self.primary_key: + partition_sql += " PRIMARY KEY (%s)" % comma_join(self.primary_key) + if self.sampling_expr: partition_sql += " SAMPLE BY %s" % self.sampling_expr @@ -117,9 +123,10 @@ class MergeTree(Engine): class CollapsingMergeTree(MergeTree): def __init__(self, date_col=None, order_by=(), sign_col='sign', sampling_expr=None, - index_granularity=8192, replica_table_path=None, replica_name=None, partition_key=None): + index_granularity=8192, replica_table_path=None, replica_name=None, partition_key=None, + primary_key=None): super(CollapsingMergeTree, self).__init__(date_col, order_by, sampling_expr, index_granularity, - replica_table_path, replica_name, partition_key) + replica_table_path, replica_name, partition_key, primary_key) self.sign_col = sign_col def _build_sql_params(self, db): @@ -131,9 +138,10 @@ class CollapsingMergeTree(MergeTree): class SummingMergeTree(MergeTree): def __init__(self, date_col=None, order_by=(), summing_cols=None, sampling_expr=None, - index_granularity=8192, replica_table_path=None, replica_name=None, partition_key=None): + index_granularity=8192, replica_table_path=None, replica_name=None, partition_key=None, + primary_key=None): super(SummingMergeTree, self).__init__(date_col, order_by, sampling_expr, index_granularity, replica_table_path, - replica_name, partition_key) + replica_name, partition_key, primary_key) assert type is None or type(summing_cols) in (list, tuple), 'summing_cols must be a list or tuple' self.summing_cols = summing_cols @@ -147,9 +155,10 @@ class SummingMergeTree(MergeTree): class ReplacingMergeTree(MergeTree): def __init__(self, date_col=None, order_by=(), ver_col=None, sampling_expr=None, - index_granularity=8192, replica_table_path=None, replica_name=None, partition_key=None): + index_granularity=8192, replica_table_path=None, replica_name=None, partition_key=None, + primary_key=None): super(ReplacingMergeTree, self).__init__(date_col, order_by, sampling_expr, index_granularity, - replica_table_path, replica_name, partition_key) + replica_table_path, replica_name, partition_key, primary_key) self.ver_col = ver_col def _build_sql_params(self, db): diff --git a/tests/test_engines.py b/tests/test_engines.py index a05d359..2d874a9 100644 --- a/tests/test_engines.py +++ b/tests/test_engines.py @@ -165,6 +165,29 @@ class EnginesTestCase(_EnginesHelperTestCase): self.assertEqual('testmodel', parts[1].table) self.assertEqual('(201701, 13)'.replace(' ', ''), parts[1].partition.replace(' ', '')) + def test_custom_primary_key(self): + class TestModel(SampleModel): + engine = MergeTree( + order_by=('date', 'event_id', 'event_group'), + partition_key=('toYYYYMM(date)',), + primary_key=('date', 'event_id') + ) + + class TestCollapseModel(SampleModel): + sign = Int8Field() + + engine = CollapsingMergeTree( + sign_col='sign', + order_by=('date', 'event_id', 'event_group'), + partition_key=('toYYYYMM(date)',), + primary_key=('date', 'event_id') + ) + + self._create_and_insert(TestModel) + self._create_and_insert(TestCollapseModel) + + self.assertEqual(2, len(list(SystemPart.get(self.database)))) + class SampleModel(Model): From 39a812b1343f8b65c9eef1a4815a358b2a9e28c0 Mon Sep 17 00:00:00 2001 From: romamo Date: Wed, 4 Mar 2020 15:02:43 +0200 Subject: [PATCH 2/8] Fixed ignoring of negative enum values --- src/infi/clickhouse_orm/fields.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index 4a4e808..bd2de33 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -414,7 +414,7 @@ class BaseEnumField(Field): import re from enum import Enum members = {} - for match in re.finditer("'(\w+)' = (\d+)", db_type): + for match in re.finditer("'(\w+)' = (-?\d+)", db_type): members[match.group(1)] = int(match.group(2)) enum_cls = Enum('AdHocEnum', members) field_class = Enum8Field if db_type.startswith('Enum8') else Enum16Field From 54c67f27772bc1e1db8d0d4fa7ac0b4a0f45e4e2 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Mon, 16 Mar 2020 10:24:02 +0200 Subject: [PATCH 3/8] Fix tests for ClickHouse v20.3 --- tests/test_database.py | 8 ++++++-- tests/test_join.py | 4 ++-- tests/test_readonly.py | 9 +++++++-- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/tests/test_database.py b/tests/test_database.py index 0433bff..4911a49 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -149,8 +149,12 @@ class DatabaseTestCase(TestCaseWithData): Database(self.database.db_name, username='default', password='wrong') exc = cm.exception - self.assertEqual(exc.code, 193) - self.assertTrue(exc.message.startswith('Wrong password for user default')) + if exc.code == 193: # ClickHouse version < 20.3 + self.assertTrue(exc.message.startswith('Wrong password for user default')) + elif exc.code == 516: # ClickHouse version >= 20.3 + self.assertTrue(exc.message.startswith('default: Authentication failed')) + else: + raise Exception('Unexpected error code - %s' % exc.code) def test_nonexisting_db(self): db = Database('db_not_here', autocreate=False) diff --git a/tests/test_join.py b/tests/test_join.py index e3d5d12..ce0ce03 100644 --- a/tests/test_join.py +++ b/tests/test_join.py @@ -30,8 +30,8 @@ class JoinTest(unittest.TestCase): self.print_res("SELECT b FROM $db.{} ALL LEFT JOIN $db.{} USING id".format(Foo.table_name(), Bar.table_name())) def test_with_subquery(self): - self.print_res("SELECT b FROM {} ALL LEFT JOIN (SELECT * from {}) USING id".format(Foo.table_name(), Bar.table_name())) - self.print_res("SELECT b FROM $db.{} ALL LEFT JOIN (SELECT * from $db.{}) USING id".format(Foo.table_name(), Bar.table_name())) + self.print_res("SELECT b FROM {} ALL LEFT JOIN (SELECT * from {}) subquery USING id".format(Foo.table_name(), Bar.table_name())) + self.print_res("SELECT b FROM $db.{} ALL LEFT JOIN (SELECT * from $db.{}) subquery USING id".format(Foo.table_name(), Bar.table_name())) class Foo(models.Model): diff --git a/tests/test_readonly.py b/tests/test_readonly.py index 816c701..4361ad7 100644 --- a/tests/test_readonly.py +++ b/tests/test_readonly.py @@ -26,7 +26,9 @@ class ReadonlyTestCase(TestCaseWithData): self.database.drop_database() self._check_db_readonly_err(cm.exception, drop_table=True) except ServerError as e: - if e.code == 192 and e.message.startswith('Unknown user'): + if e.code == 192 and e.message.startswith('Unknown user'): # ClickHouse version < 20.3 + raise unittest.SkipTest('Database user "%s" is not defined' % username) + elif e.code == 516 and e.message.startswith('readonly: Authentication failed'): # ClickHouse version >= 20.3 raise unittest.SkipTest('Database user "%s" is not defined' % username) else: raise @@ -35,7 +37,10 @@ class ReadonlyTestCase(TestCaseWithData): def _check_db_readonly_err(self, exc, drop_table=None): self.assertEqual(exc.code, 164) - if drop_table: + print(exc.message) + if self.database.server_version >= (20, 3): + self.assertTrue(exc.message.startswith('default: Cannot execute query in readonly mode')) + elif drop_table: self.assertTrue(exc.message.startswith('Cannot drop table in readonly mode')) else: self.assertTrue(exc.message.startswith('Cannot insert into table in readonly mode')) From 27eac13767225ead8672862e06895791a71fba5b Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Mon, 16 Mar 2020 10:41:03 +0200 Subject: [PATCH 4/8] Fix tests for ClickHouse v20.3 --- tests/test_readonly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_readonly.py b/tests/test_readonly.py index 4361ad7..f7c8f49 100644 --- a/tests/test_readonly.py +++ b/tests/test_readonly.py @@ -39,7 +39,7 @@ class ReadonlyTestCase(TestCaseWithData): self.assertEqual(exc.code, 164) print(exc.message) if self.database.server_version >= (20, 3): - self.assertTrue(exc.message.startswith('default: Cannot execute query in readonly mode')) + self.assertTrue('Cannot execute query in readonly mode' in exc.message) elif drop_table: self.assertTrue(exc.message.startswith('Cannot drop table in readonly mode')) else: From c27c125d61ec01a36cbdf1236ac753d52b6fd84a Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Fri, 1 May 2020 16:35:39 +0300 Subject: [PATCH 5/8] TRIVIAL update changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bcef883..b117c48 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,10 @@ Change Log ========== +Unreleased +---------- +- Added primary_key parameter to MergeTree engines (M1hacka) + v1.3.0 ------ - Support LowCardinality columns in ad-hoc queries From d1ba26af204428fce463fe6a7bfd84f2f131037b Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Fri, 1 May 2020 16:44:20 +0300 Subject: [PATCH 6/8] TRIVIAL test negative enum values --- tests/test_enum_fields.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/test_enum_fields.py b/tests/test_enum_fields.py index 0b74148..34cd3d0 100644 --- a/tests/test_enum_fields.py +++ b/tests/test_enum_fields.py @@ -22,29 +22,35 @@ class EnumFieldsTest(unittest.TestCase): def test_insert_and_select(self): self.database.insert([ ModelWithEnum(date_field='2016-08-30', enum_field=Fruit.apple), - ModelWithEnum(date_field='2016-08-31', enum_field=Fruit.orange) + ModelWithEnum(date_field='2016-08-31', enum_field=Fruit.orange), + ModelWithEnum(date_field='2016-08-31', enum_field=Fruit.cherry) ]) query = 'SELECT * from $table ORDER BY date_field' results = list(self.database.select(query, ModelWithEnum)) - self.assertEqual(len(results), 2) + self.assertEqual(len(results), 3) self.assertEqual(results[0].enum_field, Fruit.apple) self.assertEqual(results[1].enum_field, Fruit.orange) + self.assertEqual(results[2].enum_field, Fruit.cherry) def test_ad_hoc_model(self): self.database.insert([ ModelWithEnum(date_field='2016-08-30', enum_field=Fruit.apple), - ModelWithEnum(date_field='2016-08-31', enum_field=Fruit.orange) + ModelWithEnum(date_field='2016-08-31', enum_field=Fruit.orange), + ModelWithEnum(date_field='2016-08-31', enum_field=Fruit.cherry) ]) query = 'SELECT * from $db.modelwithenum ORDER BY date_field' results = list(self.database.select(query)) - self.assertEqual(len(results), 2) + self.assertEqual(len(results), 3) self.assertEqual(results[0].enum_field.name, Fruit.apple.name) self.assertEqual(results[0].enum_field.value, Fruit.apple.value) self.assertEqual(results[1].enum_field.name, Fruit.orange.name) self.assertEqual(results[1].enum_field.value, Fruit.orange.value) + self.assertEqual(results[2].enum_field.name, Fruit.cherry.name) + self.assertEqual(results[2].enum_field.value, Fruit.cherry.value) def test_conversion(self): self.assertEqual(ModelWithEnum(enum_field=3).enum_field, Fruit.orange) + self.assertEqual(ModelWithEnum(enum_field=-7).enum_field, Fruit.cherry) self.assertEqual(ModelWithEnum(enum_field='apple').enum_field, Fruit.apple) self.assertEqual(ModelWithEnum(enum_field=Fruit.banana).enum_field, Fruit.banana) @@ -66,7 +72,7 @@ class EnumFieldsTest(unittest.TestCase): self.assertEqual(results[0].enum_array, instance.enum_array) -Fruit = Enum('Fruit', u'apple banana orange') +Fruit = Enum('Fruit', [('apple', 1), ('banana', 2), ('orange', 3), ('cherry', -7)]) class ModelWithEnum(Model): @@ -83,3 +89,4 @@ class ModelWithEnumArray(Model): enum_array = ArrayField(Enum16Field(Fruit)) engine = MergeTree('date_field', ('date_field',)) + From 3019647339a588186da745bdcbbb194502dc5b46 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Fri, 1 May 2020 16:44:54 +0300 Subject: [PATCH 7/8] TRIVIAL update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b117c48..67fc188 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ Change Log Unreleased ---------- - Added primary_key parameter to MergeTree engines (M1hacka) +- Support negative enum values (Romamo) v1.3.0 ------ From fcb28b8c9f9dc6f7e5571d7fb97d9a9cea4ebf43 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sat, 16 May 2020 09:26:43 +0300 Subject: [PATCH 8/8] Releasing v1.4.0 --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 67fc188..cb498d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ Change Log ========== -Unreleased ----------- +v1.4.0 +------ - Added primary_key parameter to MergeTree engines (M1hacka) - Support negative enum values (Romamo)