From 4d2ebd65fb1686592c7501f3c86288bd2c4e65a8 Mon Sep 17 00:00:00 2001 From: Roy Belio Date: Thu, 20 Jun 2019 11:21:43 +0300 Subject: [PATCH 1/9] RAMEN-208 Support codec compression for clickhouse --- docs/field_types.md | 39 ++++++++ src/infi/clickhouse_orm/database.py | 2 + src/infi/clickhouse_orm/fields.py | 58 +++++++----- src/infi/clickhouse_orm/migrations.py | 4 +- src/infi/clickhouse_orm/models.py | 4 +- tests/sample_migrations/0015.py | 6 ++ tests/test_compressed_fields.py | 123 ++++++++++++++++++++++++++ tests/test_migrations.py | 13 +++ 8 files changed, 222 insertions(+), 27 deletions(-) create mode 100644 tests/sample_migrations/0015.py create mode 100644 tests/test_compressed_fields.py diff --git a/docs/field_types.md b/docs/field_types.md index 5a28c65..2636c5b 100644 --- a/docs/field_types.md +++ b/docs/field_types.md @@ -148,6 +148,45 @@ to `None`. NOTE: `ArrayField` of `NullableField` is not supported. Also `EnumField` cannot be nullable. +Working with field compression codecs +------------------------------------- +Besides default data compression, defined in server settings, per-field specification is also available. + +Supported compression algorithms: + +| Codec | Argument | Comment +| -------------------- | -------------------------------------------| ---------------------------------------------------- +| NONE | None | No compression. +| LZ4 | None | LZ4 compression. +| LZ4HC(`level`) | Possible `level` range: [3, 12]. | Default value: 9. Greater values stands for better compression and higher CPU usage. Recommended value range: [4,9]. +| ZSTD(`level`) | Possible `level`range: [1, 22]. | Default value: 1. Greater values stands for better compression and higher CPU usage. Levels >= 20, should be used with caution, as they require more memory. +| Delta(`delta_bytes`) | Possible `delta_bytes` range: 1, 2, 4 , 8. | Default value for `delta_bytes` is `sizeof(type)` if it is equal to 1, 2,4 or 8 and equals to 1 otherwise. + +Codecs can be combined in a pipeline. Default table codec is not included into pipeline (if it should be applied to a field, you have to specify it explicitly in pipeline). + +Recommended usage for codecs: +- Usually, values for particular metric, stored in path does not differ significantly from point to point. Using delta-encoding allows to reduce disk space usage significantly. +- DateTime works great with pipeline of Delta, ZSTD and the column size can be compressed to 2-3% of its original size (given a smooth datetime data) +- Numeric types usually enjoy best compression rates with ZSTD +- String types enjoy good compression rates with LZ4HC + +Usage: +```python +class Stats(models.Model): + + id = fields.UInt64Field(codec='ZSTD(10)') + timestamp = fields.DateTimeField(codec='Delta,ZSTD') + timestamp_date = fields.DateField(codec='Delta(4),ZSTD(22)') + metadata_id = fields.Int64Field(codec='LZ4') + status = fields.StringField(codec='LZ4HC(10)') + calculation = fields.NullableField(fields.Float32Field(), codec='ZSTD') + alerts = fields.ArrayField(fields.FixedStringField(length=15), codec='Delta(2),LZ4HC') + + engine = MergeTree('timestamp_date', ('id', 'timestamp')) + +``` +:exclamation:**_This feature is supported on clickhouse version 19.1.16 and above, codec arguments will be ignored by the ORM for clickhouse versions lower than 19.1.16_** + Creating custom field types --------------------------- Sometimes it is convenient to use data types that are supported in Python, but have no corresponding column type in ClickHouse. In these cases it is possible to define a custom field class that knows how to convert the Pythonic object to a suitable representation in the database, and vice versa. diff --git a/src/infi/clickhouse_orm/database.py b/src/infi/clickhouse_orm/database.py index 8aaae47..dba7978 100644 --- a/src/infi/clickhouse_orm/database.py +++ b/src/infi/clickhouse_orm/database.py @@ -120,6 +120,8 @@ class Database(object): self.server_version = self._get_server_version() # Versions 1.1.53981 and below don't have timezone function self.server_timezone = self._get_server_timezone() if self.server_version > (1, 1, 53981) else pytz.utc + # Versions 19.1.16 and below don't support codec compression + self.has_codec_support = self.server_version >= (19, 1, 16) def create_database(self): ''' diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index 25ae554..4c67325 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -19,7 +19,7 @@ class Field(object): class_default = 0 db_type = None - def __init__(self, default=None, alias=None, materialized=None, readonly=None): + def __init__(self, default=None, alias=None, materialized=None, readonly=None, codec=None): assert (None, None) in {(default, alias), (alias, materialized), (default, materialized)}, \ "Only one of default, alias and materialized parameters can be given" assert alias is None or isinstance(alias, string_types) and alias != "",\ @@ -27,6 +27,8 @@ class Field(object): assert materialized is None or isinstance(materialized, string_types) and alias != "",\ "Materialized field must be string, if given" assert readonly is None or type(readonly) is bool, "readonly parameter must be bool if given" + assert codec is None or isinstance(codec, string_types) and codec != "", \ + "Codec field must be string, if given" self.creation_counter = Field.creation_counter Field.creation_counter += 1 @@ -34,6 +36,7 @@ class Field(object): self.alias = alias self.materialized = materialized self.readonly = bool(self.alias or self.materialized or readonly) + self.codec = codec def to_python(self, value, timezone_in_use): ''' @@ -64,22 +67,25 @@ class Field(object): ''' return escape(value, quote) - def get_sql(self, with_default_expression=True): + def get_sql(self, with_default_expression=True, db=None): ''' Returns an SQL expression describing the field (e.g. for CREATE TABLE). :param with_default_expression: If True, adds default value to sql. It doesn't affect fields with alias and materialized values. + :param db: Database, used for checking supported features. ''' + sql = self.db_type if with_default_expression: if self.alias: - return '%s ALIAS %s' % (self.db_type, self.alias) + sql += ' ALIAS %s' % self.alias elif self.materialized: - return '%s MATERIALIZED %s' % (self.db_type, self.materialized) + sql += ' MATERIALIZED %s' % self.materialized else: default = self.to_db_string(self.default) - return '%s DEFAULT %s' % (self.db_type, default) - else: - return self.db_type + sql += ' DEFAULT %s' % default + if self.codec and db and db.has_codec_support: + sql+= ' CODEC(%s)' % self.codec + return sql def isinstance(self, types): """ @@ -361,11 +367,11 @@ class BaseEnumField(Field): Abstract base class for all enum-type fields. ''' - def __init__(self, enum_cls, default=None, alias=None, materialized=None, readonly=None): + def __init__(self, enum_cls, default=None, alias=None, materialized=None, readonly=None, codec=None): self.enum_cls = enum_cls if default is None: default = list(enum_cls)[0] - super(BaseEnumField, self).__init__(default, alias, materialized, readonly) + super(BaseEnumField, self).__init__(default, alias, materialized, readonly, codec) def to_python(self, value, timezone_in_use): if isinstance(value, self.enum_cls): @@ -384,12 +390,14 @@ class BaseEnumField(Field): def to_db_string(self, value, quote=True): return escape(value.name, quote) - def get_sql(self, with_default_expression=True): + def get_sql(self, with_default_expression=True, db=None): values = ['%s = %d' % (escape(item.name), item.value) for item in self.enum_cls] sql = '%s(%s)' % (self.db_type, ' ,'.join(values)) if with_default_expression: default = self.to_db_string(self.default) sql = '%s DEFAULT %s' % (sql, default) + if self.codec and db and db.has_codec_support: + sql+= ' CODEC(%s)' % self.codec return sql @classmethod @@ -425,11 +433,11 @@ class ArrayField(Field): class_default = [] - def __init__(self, inner_field, default=None, alias=None, materialized=None, readonly=None): + def __init__(self, inner_field, default=None, alias=None, materialized=None, readonly=None, codec=None): assert isinstance(inner_field, Field), "The first argument of ArrayField must be a Field instance" assert not isinstance(inner_field, ArrayField), "Multidimensional array fields are not supported by the ORM" self.inner_field = inner_field - super(ArrayField, self).__init__(default, alias, materialized, readonly) + super(ArrayField, self).__init__(default, alias, materialized, readonly, codec) def to_python(self, value, timezone_in_use): if isinstance(value, text_type): @@ -448,9 +456,11 @@ class ArrayField(Field): array = [self.inner_field.to_db_string(v, quote=True) for v in value] return '[' + comma_join(array) + ']' - def get_sql(self, with_default_expression=True): - from .utils import escape - return 'Array(%s)' % self.inner_field.get_sql(with_default_expression=False) + def get_sql(self, with_default_expression=True, db=None): + sql = 'Array(%s)' % self.inner_field.get_sql(with_default_expression=False) + if self.codec and db and db.has_codec_support: + sql+= ' CODEC(%s)' % self.codec + return sql class UUIDField(Field): @@ -481,12 +491,12 @@ class NullableField(Field): class_default = None def __init__(self, inner_field, default=None, alias=None, materialized=None, - extra_null_values=None): + extra_null_values=None, codec=None): self.inner_field = inner_field self._null_values = [None] if extra_null_values: self._null_values.extend(extra_null_values) - super(NullableField, self).__init__(default, alias, materialized, readonly=None) + super(NullableField, self).__init__(default, alias, materialized, readonly=None, codec=codec) def to_python(self, value, timezone_in_use): if value == '\\N' or value in self._null_values: @@ -501,14 +511,16 @@ class NullableField(Field): return '\\N' return self.inner_field.to_db_string(value, quote=quote) - def get_sql(self, with_default_expression=True): - s = 'Nullable(%s)' % self.inner_field.get_sql(with_default_expression=False) + def get_sql(self, with_default_expression=True, db=None): + sql = 'Nullable(%s)' % self.inner_field.get_sql(with_default_expression=False) if with_default_expression: if self.alias: - s = '%s ALIAS %s' % (s, self.alias) + sql += ' ALIAS %s' % self.alias elif self.materialized: - s = '%s MATERIALIZED %s' % (s, self.materialized) + sql += ' MATERIALIZED %s' % self.materialized elif self.default: default = self.to_db_string(self.default) - s = '%s DEFAULT %s' % (s, default) - return s + sql += ' DEFAULT %s' % default + if self.codec and db and db.has_codec_support: + sql+= ' CODEC(%s)' % self.codec + return sql diff --git a/src/infi/clickhouse_orm/migrations.py b/src/infi/clickhouse_orm/migrations.py index 7e34430..55622a9 100644 --- a/src/infi/clickhouse_orm/migrations.py +++ b/src/infi/clickhouse_orm/migrations.py @@ -79,7 +79,7 @@ class AlterTable(Operation): if name not in table_fields: logger.info(' Add column %s', name) assert prev_name, 'Cannot add a column to the beginning of the table' - cmd = 'ADD COLUMN %s %s' % (name, field.get_sql()) + cmd = 'ADD COLUMN %s %s' % (name, field.get_sql(db=database)) if is_regular_field: cmd += ' AFTER %s' % prev_name self._alter_table(database, cmd) @@ -93,7 +93,7 @@ class AlterTable(Operation): # The order of class attributes can be changed any time, so we can't count on it # Secondly, MATERIALIZED and ALIAS fields are always at the end of the DESC, so we can't expect them to save # attribute position. Watch https://github.com/Infinidat/infi.clickhouse_orm/issues/47 - model_fields = {name: field.get_sql(with_default_expression=False) + model_fields = {name: field.get_sql(with_default_expression=False, db=database) for name, field in iteritems(self.model_class.fields())} for field_name, field_sql in self._get_table_fields(database): # All fields must have been created and dropped by this moment diff --git a/src/infi/clickhouse_orm/models.py b/src/infi/clickhouse_orm/models.py index beae53f..5f8b085 100644 --- a/src/infi/clickhouse_orm/models.py +++ b/src/infi/clickhouse_orm/models.py @@ -190,7 +190,7 @@ class Model(with_metaclass(ModelBase)): parts = ['CREATE TABLE IF NOT EXISTS `%s`.`%s` (' % (db.db_name, cls.table_name())] cols = [] for name, field in iteritems(cls.fields()): - cols.append(' %s %s' % (name, field.get_sql())) + cols.append(' %s %s' % (name, field.get_sql(db=db))) parts.append(',\n'.join(cols)) parts.append(')') parts.append('ENGINE = ' + cls.engine.create_table_sql(db)) @@ -316,7 +316,7 @@ class MergeModel(Model): cols = [] for name, field in iteritems(cls.fields()): if name != '_table': - cols.append(' %s %s' % (name, field.get_sql())) + cols.append(' %s %s' % (name, field.get_sql(db=db))) parts.append(',\n'.join(cols)) parts.append(')') parts.append('ENGINE = ' + cls.engine.create_table_sql(db)) diff --git a/tests/sample_migrations/0015.py b/tests/sample_migrations/0015.py new file mode 100644 index 0000000..c078e28 --- /dev/null +++ b/tests/sample_migrations/0015.py @@ -0,0 +1,6 @@ +from infi.clickhouse_orm import migrations +from ..test_migrations import * + +operations = [ + migrations.AlterTableWithBuffer(Model4_compressed) +] diff --git a/tests/test_compressed_fields.py b/tests/test_compressed_fields.py new file mode 100644 index 0000000..5bb3282 --- /dev/null +++ b/tests/test_compressed_fields.py @@ -0,0 +1,123 @@ +from __future__ import unicode_literals +import unittest +import datetime +import pytz + +from infi.clickhouse_orm.database import Database +from infi.clickhouse_orm.models import Model +from infi.clickhouse_orm.fields import * +from infi.clickhouse_orm.engines import * +from infi.clickhouse_orm.utils import parse_tsv + + +class CompressedFieldsTestCase(unittest.TestCase): + + def setUp(self): + self.database = Database('test-db', log_statements=True) + self.database.create_table(CompressedModel) + + def tearDown(self): + self.database.drop_database() + + def test_defaults(self): + # Check that all fields have their explicit or implicit defaults + instance = CompressedModel() + self.database.insert([instance]) + self.assertEqual(instance.date_field, datetime.date(1970, 1, 1)) + self.assertEqual(instance.datetime_field, datetime.datetime(1970, 1, 1, tzinfo=pytz.utc)) + self.assertEqual(instance.string_field, 'dozo') + self.assertEqual(instance.int64_field, 42) + self.assertEqual(instance.float_field, 0) + self.assertEqual(instance.nullable_field, None) + self.assertEqual(instance.array_field, []) + + def test_assignment(self): + # Check that all fields are assigned during construction + kwargs = dict( + uint64_field=217, + date_field=datetime.date(1973, 12, 6), + datetime_field=datetime.datetime(2000, 5, 24, 10, 22, tzinfo=pytz.utc), + string_field='aloha', + int64_field=-50, + float_field=3.14, + nullable_field=-2.718281, + array_field=['123456789123456','','a'] + ) + instance = CompressedModel(**kwargs) + self.database.insert([instance]) + for name, value in kwargs.items(): + self.assertEqual(kwargs[name], getattr(instance, name)) + + def test_string_conversion(self): + # Check field conversion from string during construction + instance = CompressedModel(date_field='1973-12-06', int64_field='100', float_field='7', nullable_field=None, array_field='[a,b,c]') + self.assertEqual(instance.date_field, datetime.date(1973, 12, 6)) + self.assertEqual(instance.int64_field, 100) + self.assertEqual(instance.float_field, 7) + self.assertEqual(instance.nullable_field, None) + self.assertEqual(instance.array_field, ['a', 'b', 'c']) + # Check field conversion from string during assignment + instance.int64_field = '99' + self.assertEqual(instance.int64_field, 99) + + def test_to_dict(self): + instance = CompressedModel(date_field='1973-12-06', int64_field='100', float_field='7', array_field='[a,b,c]') + self.assertDictEqual(instance.to_dict(), { + "date_field": datetime.date(1973, 12, 6), + "int64_field": 100, + "float_field": 7.0, + "datetime_field": datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.utc), + "alias_field": 0.0, + 'string_field': 'dozo', + 'nullable_field': None, + 'uint64_field': 0, + 'array_field': ['a','b','c'] + }) + self.assertDictEqual(instance.to_dict(include_readonly=False), { + "date_field": datetime.date(1973, 12, 6), + "int64_field": 100, + "float_field": 7.0, + "datetime_field": datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.utc), + 'string_field': 'dozo', + 'nullable_field': None, + 'uint64_field': 0, + 'array_field': ['a', 'b', 'c'] + }) + self.assertDictEqual( + instance.to_dict(include_readonly=False, field_names=('int64_field', 'alias_field', 'datetime_field')), { + "int64_field": 100, + "datetime_field": datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.utc) + }) + + # This test will fail on clickhouse version < 19.1.16, use skip test + def test_confirm_compression_codec(self): + instance = CompressedModel(date_field='1973-12-06', int64_field='100', float_field='7', array_field='[a,b,c]') + self.database.insert([instance]) + r = self.database.raw("select name, compression_codec from system.columns where table = '{}' and database='{}' FORMAT TabSeparatedWithNamesAndTypes".format(instance.table_name(), self.database.db_name)) + lines = r.splitlines() + field_names = parse_tsv(lines[0]) + field_types = parse_tsv(lines[1]) + data = [tuple(parse_tsv(line)) for line in lines[2:]] + self.assertListEqual(data, [('uint64_field', 'CODEC(ZSTD(10))'), + ('datetime_field', 'CODEC(Delta(4), ZSTD(1))'), + ('date_field', 'CODEC(Delta(4), ZSTD(22))'), + ('int64_field', 'CODEC(LZ4)'), + ('string_field', 'CODEC(LZ4HC(10))'), + ('nullable_field', 'CODEC(ZSTD(1))'), + ('array_field', 'CODEC(Delta(2), LZ4HC(0))'), + ('float_field', 'CODEC(NONE)'), + ('alias_field', 'CODEC(ZSTD(4))')]) + + +class CompressedModel(Model): + uint64_field = UInt64Field(codec='ZSTD(10)') + datetime_field = DateTimeField(codec='Delta,ZSTD') + date_field = DateField(codec='Delta(4),ZSTD(22)') + int64_field = Int64Field(default=42, codec='LZ4') + string_field = StringField(default='dozo', codec='LZ4HC(10)') + nullable_field = NullableField(Float32Field(), codec='ZSTD') + array_field = ArrayField(FixedStringField(length=15), codec='Delta(2),LZ4HC') + float_field = Float32Field(codec='NONE') + alias_field = Float32Field(alias='float_field', codec='ZSTD(4)') + + engine = MergeTree('datetime_field', ('uint64_field', 'datetime_field')) \ No newline at end of file diff --git a/tests/test_migrations.py b/tests/test_migrations.py index d84450f..c6ee5ce 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -258,3 +258,16 @@ class Model4Buffer_changed(BufferModel, Model4_changed): @classmethod def table_name(cls): return 'model4buffer' + + +class Model4_compressed(Model): + + date = DateField(codec='Delta(4),ZSTD') + f3 = DateTimeField(codec='Delta,ZSTD(10)') + f2 = StringField(codec='LZ4HC') + + engine = MergeTree('date', ('date',)) + + @classmethod + def table_name(cls): + return 'model4' \ No newline at end of file From 1889ac6372be85b697d1b0d17dbacc4a33d242a9 Mon Sep 17 00:00:00 2001 From: Roy Belio Date: Sun, 23 Jun 2019 11:53:58 +0300 Subject: [PATCH 2/9] Minor bug fixes field creation won't allow empty string materialized field. repliaca_name check is none fix enum usage typos fix --- src/infi/clickhouse_orm/engines.py | 2 +- src/infi/clickhouse_orm/fields.py | 10 +++------- tests/test_enum_fields.py | 5 +---- tests/test_migrations.py | 5 +---- tests/test_querysets.py | 8 ++------ 5 files changed, 8 insertions(+), 22 deletions(-) diff --git a/src/infi/clickhouse_orm/engines.py b/src/infi/clickhouse_orm/engines.py index ea6d3f4..e38bfcf 100644 --- a/src/infi/clickhouse_orm/engines.py +++ b/src/infi/clickhouse_orm/engines.py @@ -40,7 +40,7 @@ class MergeTree(Engine): assert date_col is None or isinstance(date_col, six.string_types), 'date_col must be string if present' assert partition_key is None or type(partition_key) in (list, tuple),\ 'partition_key must be tuple or list if present' - assert (replica_table_path is None) == (replica_name == None), \ + assert (replica_table_path is None) == (replica_name is None), \ 'both replica_table_path and replica_name must be specified' # These values conflict with each other (old and new syntax of table engines. diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index 4c67325..de1d0d7 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -3,7 +3,6 @@ from six import string_types, text_type, binary_type, integer_types import datetime import iso8601 import pytz -import time from calendar import timegm from decimal import Decimal, localcontext from uuid import UUID @@ -23,8 +22,8 @@ class Field(object): assert (None, None) in {(default, alias), (alias, materialized), (default, materialized)}, \ "Only one of default, alias and materialized parameters can be given" assert alias is None or isinstance(alias, string_types) and alias != "",\ - "Alias field must be string field name, if given" - assert materialized is None or isinstance(materialized, string_types) and alias != "",\ + "Alias field must be a string, if given" + assert materialized is None or isinstance(materialized, string_types) and materialized != "",\ "Materialized field must be string, if given" assert readonly is None or type(readonly) is bool, "readonly parameter must be bool if given" assert codec is None or isinstance(codec, string_types) and codec != "", \ @@ -407,10 +406,7 @@ class BaseEnumField(Field): this method returns a matching enum field. ''' import re - try: - Enum # exists in Python 3.4+ - except NameError: - from enum import Enum # use the enum34 library instead + from enum import Enum members = {} for match in re.finditer("'(\w+)' = (\d+)", db_type): members[match.group(1)] = int(match.group(2)) diff --git a/tests/test_enum_fields.py b/tests/test_enum_fields.py index c6b7b1f..0b74148 100644 --- a/tests/test_enum_fields.py +++ b/tests/test_enum_fields.py @@ -6,10 +6,7 @@ from infi.clickhouse_orm.models import Model from infi.clickhouse_orm.fields import * from infi.clickhouse_orm.engines import * -try: - Enum # exists in Python 3.4+ -except NameError: - from enum import Enum # use the enum34 library instead +from enum import Enum class EnumFieldsTest(unittest.TestCase): diff --git a/tests/test_migrations.py b/tests/test_migrations.py index c6ee5ce..19f0ee1 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -7,14 +7,11 @@ from infi.clickhouse_orm.fields import * from infi.clickhouse_orm.engines import * from infi.clickhouse_orm.migrations import MigrationHistory +from enum import Enum # Add tests to path so that migrations will be importable import sys, os sys.path.append(os.path.dirname(__file__)) -try: - Enum # exists in Python 3.4+ -except NameError: - from enum import Enum # use the enum34 library instead import logging logging.basicConfig(level=logging.DEBUG, format='%(message)s') diff --git a/tests/test_querysets.py b/tests/test_querysets.py index ea3d73c..b17933b 100644 --- a/tests/test_querysets.py +++ b/tests/test_querysets.py @@ -7,11 +7,7 @@ from infi.clickhouse_orm.query import Q from .base_test_with_data import * import logging from datetime import date, datetime - -try: - Enum # exists in Python 3.4+ -except NameError: - from enum import Enum # use the enum34 library instead +from enum import Enum class QuerySetTestCase(TestCaseWithData): @@ -227,7 +223,7 @@ class QuerySetTestCase(TestCaseWithData): qs = Person.objects_in(self.database).order_by('first_name', 'last_name') # Try different page sizes for page_size in (1, 2, 7, 10, 30, 100, 150): - # Iterate over pages and collect all intances + # Iterate over pages and collect all instances page_num = 1 instances = set() while True: From 2d3441b1270f7b91960f51907424d984700cba99 Mon Sep 17 00:00:00 2001 From: Roy Belio Date: Mon, 24 Jun 2019 12:31:19 +0300 Subject: [PATCH 3/9] RAMEN-208 Support codec compression for clickhouse --- src/infi/clickhouse_orm/database.py | 2 +- src/infi/clickhouse_orm/fields.py | 18 +++++++++--------- tests/sample_migrations/0015.py | 2 +- tests/test_migrations.py | 4 +++- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/infi/clickhouse_orm/database.py b/src/infi/clickhouse_orm/database.py index dba7978..939a12d 100644 --- a/src/infi/clickhouse_orm/database.py +++ b/src/infi/clickhouse_orm/database.py @@ -120,7 +120,7 @@ class Database(object): self.server_version = self._get_server_version() # Versions 1.1.53981 and below don't have timezone function self.server_timezone = self._get_server_timezone() if self.server_version > (1, 1, 53981) else pytz.utc - # Versions 19.1.16 and below don't support codec compression + # Versions 19.1.16 and above support codec compression self.has_codec_support = self.server_version >= (19, 1, 16) def create_database(self): diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index de1d0d7..ae47efd 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -82,8 +82,8 @@ class Field(object): else: default = self.to_db_string(self.default) sql += ' DEFAULT %s' % default - if self.codec and db and db.has_codec_support: - sql+= ' CODEC(%s)' % self.codec + if self.codec and db and db.has_codec_support: + sql+= ' CODEC(%s)' % self.codec return sql def isinstance(self, types): @@ -395,8 +395,8 @@ class BaseEnumField(Field): if with_default_expression: default = self.to_db_string(self.default) sql = '%s DEFAULT %s' % (sql, default) - if self.codec and db and db.has_codec_support: - sql+= ' CODEC(%s)' % self.codec + if self.codec and db and db.has_codec_support: + sql+= ' CODEC(%s)' % self.codec return sql @classmethod @@ -453,8 +453,8 @@ class ArrayField(Field): return '[' + comma_join(array) + ']' def get_sql(self, with_default_expression=True, db=None): - sql = 'Array(%s)' % self.inner_field.get_sql(with_default_expression=False) - if self.codec and db and db.has_codec_support: + sql = 'Array(%s)' % self.inner_field.get_sql(with_default_expression=False, db=db) + if with_default_expression and self.codec and db and db.has_codec_support: sql+= ' CODEC(%s)' % self.codec return sql @@ -508,7 +508,7 @@ class NullableField(Field): return self.inner_field.to_db_string(value, quote=quote) def get_sql(self, with_default_expression=True, db=None): - sql = 'Nullable(%s)' % self.inner_field.get_sql(with_default_expression=False) + sql = 'Nullable(%s)' % self.inner_field.get_sql(with_default_expression=False, db=db) if with_default_expression: if self.alias: sql += ' ALIAS %s' % self.alias @@ -517,6 +517,6 @@ class NullableField(Field): elif self.default: default = self.to_db_string(self.default) sql += ' DEFAULT %s' % default - if self.codec and db and db.has_codec_support: - sql+= ' CODEC(%s)' % self.codec + if self.codec and db and db.has_codec_support: + sql+= ' CODEC(%s)' % self.codec return sql diff --git a/tests/sample_migrations/0015.py b/tests/sample_migrations/0015.py index c078e28..1ab9b5b 100644 --- a/tests/sample_migrations/0015.py +++ b/tests/sample_migrations/0015.py @@ -2,5 +2,5 @@ from infi.clickhouse_orm import migrations from ..test_migrations import * operations = [ - migrations.AlterTableWithBuffer(Model4_compressed) + migrations.AlterTable(Model4_compressed), ] diff --git a/tests/test_migrations.py b/tests/test_migrations.py index 19f0ee1..6c1af1a 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -94,6 +94,8 @@ class MigrationsTestCase(unittest.TestCase): self.assertTrue(self.tableExists(AliasModel1)) self.assertEqual(self.getTableFields(AliasModel1), [('date', 'Date'), ('int_field', 'Int8'), ('date_alias', 'Date'), ('int_field_plus_one', 'Int8')]) + self.database.migrate('tests.sample_migrations', 15) + self.assertTrue(self.tableExists(Model4_compressed)) # Several different models with the same table name, to simulate a table that changes over time @@ -259,7 +261,7 @@ class Model4Buffer_changed(BufferModel, Model4_changed): class Model4_compressed(Model): - date = DateField(codec='Delta(4),ZSTD') + date = DateField() f3 = DateTimeField(codec='Delta,ZSTD(10)') f2 = StringField(codec='LZ4HC') From 3ba44608f3bc48626baa2ce6c1668e504ccb66fe Mon Sep 17 00:00:00 2001 From: Roy Belio Date: Mon, 24 Jun 2019 14:20:18 +0300 Subject: [PATCH 4/9] RAMEN-206 Support LowCardinality in infi.clickhouse_orm --- src/infi/clickhouse_orm/database.py | 2 ++ src/infi/clickhouse_orm/fields.py | 41 ++++++++++++++++++++++++++++- tests/sample_migrations/0015.py | 1 + tests/test_migrations.py | 26 +++++++++++++++++- 4 files changed, 68 insertions(+), 2 deletions(-) diff --git a/src/infi/clickhouse_orm/database.py b/src/infi/clickhouse_orm/database.py index 939a12d..ae47bc0 100644 --- a/src/infi/clickhouse_orm/database.py +++ b/src/infi/clickhouse_orm/database.py @@ -122,6 +122,8 @@ class Database(object): self.server_timezone = self._get_server_timezone() if self.server_version > (1, 1, 53981) else pytz.utc # Versions 19.1.16 and above support codec compression self.has_codec_support = self.server_version >= (19, 1, 16) + # Version 19.0 and above support LowCardinality + self.has_low_cardinality_support = self.server_version >= (19, 0) def create_database(self): ''' diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index ae47efd..8b6d9f7 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -6,9 +6,10 @@ import pytz from calendar import timegm from decimal import Decimal, localcontext from uuid import UUID - +from logging import getLogger from .utils import escape, parse_array, comma_join +logger = getLogger('clickhouse_orm') class Field(object): ''' @@ -520,3 +521,41 @@ class NullableField(Field): if self.codec and db and db.has_codec_support: sql+= ' CODEC(%s)' % self.codec return sql + + +class LowCardinalityField(Field): + + def __init__(self, inner_field, default=None, alias=None, materialized=None, readonly=None, codec=None): + assert isinstance(inner_field, Field), "The first argument of LowCardinalityField must be a Field instance. Not: {}".format(inner_field) + assert not isinstance(inner_field, LowCardinalityField), "LowCardinality inner fields are not supported by the ORM" + assert not isinstance(inner_field, ArrayField), "Array field inside LowCardinality are not supported by the ORM. Use Array(LowCardinality) instead" + self.inner_field = inner_field + self.class_default = self.inner_field.class_default + super(LowCardinalityField, self).__init__(default, alias, materialized, readonly, codec) + + def to_python(self, value, timezone_in_use): + return self.inner_field.to_python(value, timezone_in_use) + + def validate(self, value): + self.inner_field.validate(value) + + def to_db_string(self, value, quote=True): + return self.inner_field.to_db_string(value, quote=quote) + + def get_sql(self, with_default_expression=True, db=None): + if db and db.has_low_cardinality_support: + sql = 'LowCardinality(%s)' % self.inner_field.get_sql(with_default_expression=False) + else: + sql = self.inner_field.get_sql(with_default_expression=False) + logger.warning('LowCardinalityField not supported on clickhouse-server version < 19.0 using {} as fallback'.format(self.inner_field.__class__.__name__)) + if with_default_expression: + if self.alias: + sql += ' ALIAS %s' % self.alias + elif self.materialized: + sql += ' MATERIALIZED %s' % self.materialized + elif self.default: + default = self.to_db_string(self.default) + sql += ' DEFAULT %s' % default + if self.codec and db and db.has_codec_support: + sql+= ' CODEC(%s)' % self.codec + return sql diff --git a/tests/sample_migrations/0015.py b/tests/sample_migrations/0015.py index 1ab9b5b..be1d378 100644 --- a/tests/sample_migrations/0015.py +++ b/tests/sample_migrations/0015.py @@ -3,4 +3,5 @@ from ..test_migrations import * operations = [ migrations.AlterTable(Model4_compressed), + migrations.AlterTable(Model2LowCardinality) ] diff --git a/tests/test_migrations.py b/tests/test_migrations.py index 6c1af1a..f92b1e9 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -96,6 +96,15 @@ class MigrationsTestCase(unittest.TestCase): [('date', 'Date'), ('int_field', 'Int8'), ('date_alias', 'Date'), ('int_field_plus_one', 'Int8')]) self.database.migrate('tests.sample_migrations', 15) self.assertTrue(self.tableExists(Model4_compressed)) + if self.database.has_low_cardinality_support: + self.assertEqual(self.getTableFields(Model2LowCardinality), + [('date', 'Date'), ('f1', 'LowCardinality(Int32)'), ('f3', 'LowCardinality(Float32)'), + ('f2', 'LowCardinality(String)'), ('f4', 'LowCardinality(Nullable(String))'), ('f5', 'Array(LowCardinality(UInt64))')]) + else: + logging.warning('No support for low cardinality') + self.assertEqual(self.getTableFields(Model2), + [('date', 'Date'), ('f1', 'Int32'), ('f3', 'Float32'), ('f2', 'String'), ('f4', 'Nullable(String)'), + ('f5', 'Array(UInt64)')]) # Several different models with the same table name, to simulate a table that changes over time @@ -269,4 +278,19 @@ class Model4_compressed(Model): @classmethod def table_name(cls): - return 'model4' \ No newline at end of file + return 'model4' + + +class Model2LowCardinality(Model): + date = DateField() + f1 = LowCardinalityField(Int32Field()) + f3 = LowCardinalityField(Float32Field()) + f2 = LowCardinalityField(StringField()) + f4 = LowCardinalityField(NullableField(StringField())) + f5 = ArrayField(LowCardinalityField(UInt64Field())) + + engine = MergeTree('date', ('date',)) + + @classmethod + def table_name(cls): + return 'mig' From bcc4c29d107910c0b2f63c05f8d73d64495e3b0c Mon Sep 17 00:00:00 2001 From: Roy Belio Date: Mon, 24 Jun 2019 18:54:55 +0300 Subject: [PATCH 5/9] Drying get_sql methods Some updates to Nullable documentation --- docs/field_types.md | 5 ++-- src/infi/clickhouse_orm/fields.py | 43 ++++++++++++------------------- 2 files changed, 19 insertions(+), 29 deletions(-) diff --git a/docs/field_types.md b/docs/field_types.md index 2636c5b..73437c0 100644 --- a/docs/field_types.md +++ b/docs/field_types.md @@ -121,8 +121,7 @@ db.select('SELECT * FROM $db.event', model_class=Event) Working with nullable fields ---------------------------- -From [some time](https://github.com/yandex/ClickHouse/pull/70) ClickHouse provides a NULL value support. -Also see some information [here](https://github.com/yandex/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00395_nullable.sql). +[ClickHouse provides a NULL value support](https://clickhouse.yandex/docs/en/data_types/nullable). Wrapping another field in a `NullableField` makes it possible to assign `None` to that field. For example: @@ -148,6 +147,8 @@ to `None`. NOTE: `ArrayField` of `NullableField` is not supported. Also `EnumField` cannot be nullable. +NOTE: Using `Nullable` almost always negatively affects performance, keep this in mind when designing your databases. + Working with field compression codecs ------------------------------------- Besides default data compression, defined in server settings, per-field specification is also available. diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index 8b6d9f7..4a4e808 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -76,15 +76,20 @@ class Field(object): ''' sql = self.db_type if with_default_expression: - if self.alias: - sql += ' ALIAS %s' % self.alias - elif self.materialized: - sql += ' MATERIALIZED %s' % self.materialized - else: - default = self.to_db_string(self.default) - sql += ' DEFAULT %s' % default - if self.codec and db and db.has_codec_support: - sql+= ' CODEC(%s)' % self.codec + sql += self._extra_params(db) + return sql + + def _extra_params(self, db): + sql = '' + if self.alias: + sql += ' ALIAS %s' % self.alias + elif self.materialized: + sql += ' MATERIALIZED %s' % self.materialized + elif self.default: + default = self.to_db_string(self.default) + sql += ' DEFAULT %s' % default + if self.codec and db and db.has_codec_support: + sql += ' CODEC(%s)' % self.codec return sql def isinstance(self, types): @@ -511,15 +516,7 @@ class NullableField(Field): def get_sql(self, with_default_expression=True, db=None): sql = 'Nullable(%s)' % self.inner_field.get_sql(with_default_expression=False, db=db) if with_default_expression: - if self.alias: - sql += ' ALIAS %s' % self.alias - elif self.materialized: - sql += ' MATERIALIZED %s' % self.materialized - elif self.default: - default = self.to_db_string(self.default) - sql += ' DEFAULT %s' % default - if self.codec and db and db.has_codec_support: - sql+= ' CODEC(%s)' % self.codec + sql += self._extra_params(db) return sql @@ -549,13 +546,5 @@ class LowCardinalityField(Field): sql = self.inner_field.get_sql(with_default_expression=False) logger.warning('LowCardinalityField not supported on clickhouse-server version < 19.0 using {} as fallback'.format(self.inner_field.__class__.__name__)) if with_default_expression: - if self.alias: - sql += ' ALIAS %s' % self.alias - elif self.materialized: - sql += ' MATERIALIZED %s' % self.materialized - elif self.default: - default = self.to_db_string(self.default) - sql += ' DEFAULT %s' % default - if self.codec and db and db.has_codec_support: - sql+= ' CODEC(%s)' % self.codec + sql += self._extra_params(db) return sql From 38bb4981b8ab60ca7dc57a291330df091387c42f Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Tue, 25 Jun 2019 07:46:37 +0300 Subject: [PATCH 6/9] Update docs --- CHANGELOG.md | 5 ++++ docs/class_reference.md | 54 +++++++++++++++++++++++------------------ docs/toc.md | 3 ++- 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b4b646f..925e833 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ Change Log ========== +Unreleased +---------- +- Add support for per-field compression codecs (rbelio, Chocorean) +- Add support for low cardinality fields (rbelio) + v1.1.0 ------ - Add PREWHERE support to querysets (M1hacka) diff --git a/docs/class_reference.md b/docs/class_reference.md index fb41cfd..bcace93 100644 --- a/docs/class_reference.md +++ b/docs/class_reference.md @@ -10,7 +10,7 @@ infi.clickhouse_orm.database Database instances connect to a specific ClickHouse database for running queries, inserting data and other operations. -#### Database(db_name, db_url="http://localhost:8123/", username=None, password=None, readonly=False, autocreate=True, timeout=60, verify_ssl_cert=True) +#### Database(db_name, db_url="http://localhost:8123/", username=None, password=None, readonly=False, autocreate=True, timeout=60, verify_ssl_cert=True, log_statements=False) Initializes a database instance. Unless it's readonly, the database will be @@ -24,6 +24,7 @@ created on the ClickHouse server if it does not already exist. - `autocreate`: automatically create the database if it does not exist (unless in readonly mode). - `timeout`: the connection timeout in seconds. - `verify_ssl_cert`: whether to verify the server's certificate when connecting via HTTPS. +- `log_statements`: when True, all database statements are logged. #### add_setting(name, value) @@ -510,7 +511,7 @@ infi.clickhouse_orm.fields Extends Field -#### ArrayField(inner_field, default=None, alias=None, materialized=None, readonly=None) +#### ArrayField(inner_field, default=None, alias=None, materialized=None, readonly=None, codec=None) ### BaseEnumField @@ -520,7 +521,7 @@ Extends Field Abstract base class for all enum-type fields. -#### BaseEnumField(enum_cls, default=None, alias=None, materialized=None, readonly=None) +#### BaseEnumField(enum_cls, default=None, alias=None, materialized=None, readonly=None, codec=None) ### BaseFloatField @@ -530,7 +531,7 @@ Extends Field Abstract base class for all float-type fields. -#### BaseFloatField(default=None, alias=None, materialized=None, readonly=None) +#### BaseFloatField(default=None, alias=None, materialized=None, readonly=None, codec=None) ### BaseIntField @@ -540,21 +541,21 @@ Extends Field Abstract base class for all integer-type fields. -#### BaseIntField(default=None, alias=None, materialized=None, readonly=None) +#### BaseIntField(default=None, alias=None, materialized=None, readonly=None, codec=None) ### DateField Extends Field -#### DateField(default=None, alias=None, materialized=None, readonly=None) +#### DateField(default=None, alias=None, materialized=None, readonly=None, codec=None) ### DateTimeField Extends Field -#### DateTimeField(default=None, alias=None, materialized=None, readonly=None) +#### DateTimeField(default=None, alias=None, materialized=None, readonly=None, codec=None) ### Decimal128Field @@ -592,14 +593,14 @@ Base class for all decimal fields. Can also be used directly. Extends BaseEnumField -#### Enum16Field(enum_cls, default=None, alias=None, materialized=None, readonly=None) +#### Enum16Field(enum_cls, default=None, alias=None, materialized=None, readonly=None, codec=None) ### Enum8Field Extends BaseEnumField -#### Enum8Field(enum_cls, default=None, alias=None, materialized=None, readonly=None) +#### Enum8Field(enum_cls, default=None, alias=None, materialized=None, readonly=None, codec=None) ### Field @@ -607,7 +608,7 @@ Extends BaseEnumField Abstract base class for all field types. -#### Field(default=None, alias=None, materialized=None, readonly=None) +#### Field(default=None, alias=None, materialized=None, readonly=None, codec=None) ### FixedStringField @@ -621,91 +622,98 @@ Extends StringField Extends BaseFloatField -#### Float32Field(default=None, alias=None, materialized=None, readonly=None) +#### Float32Field(default=None, alias=None, materialized=None, readonly=None, codec=None) ### Float64Field Extends BaseFloatField -#### Float64Field(default=None, alias=None, materialized=None, readonly=None) +#### Float64Field(default=None, alias=None, materialized=None, readonly=None, codec=None) ### Int16Field Extends BaseIntField -#### Int16Field(default=None, alias=None, materialized=None, readonly=None) +#### Int16Field(default=None, alias=None, materialized=None, readonly=None, codec=None) ### Int32Field Extends BaseIntField -#### Int32Field(default=None, alias=None, materialized=None, readonly=None) +#### Int32Field(default=None, alias=None, materialized=None, readonly=None, codec=None) ### Int64Field Extends BaseIntField -#### Int64Field(default=None, alias=None, materialized=None, readonly=None) +#### Int64Field(default=None, alias=None, materialized=None, readonly=None, codec=None) ### Int8Field Extends BaseIntField -#### Int8Field(default=None, alias=None, materialized=None, readonly=None) +#### Int8Field(default=None, alias=None, materialized=None, readonly=None, codec=None) + + +### LowCardinalityField + +Extends Field + +#### LowCardinalityField(inner_field, default=None, alias=None, materialized=None, readonly=None, codec=None) ### NullableField Extends Field -#### NullableField(inner_field, default=None, alias=None, materialized=None, extra_null_values=None) +#### NullableField(inner_field, default=None, alias=None, materialized=None, extra_null_values=None, codec=None) ### StringField Extends Field -#### StringField(default=None, alias=None, materialized=None, readonly=None) +#### StringField(default=None, alias=None, materialized=None, readonly=None, codec=None) ### UInt16Field Extends BaseIntField -#### UInt16Field(default=None, alias=None, materialized=None, readonly=None) +#### UInt16Field(default=None, alias=None, materialized=None, readonly=None, codec=None) ### UInt32Field Extends BaseIntField -#### UInt32Field(default=None, alias=None, materialized=None, readonly=None) +#### UInt32Field(default=None, alias=None, materialized=None, readonly=None, codec=None) ### UInt64Field Extends BaseIntField -#### UInt64Field(default=None, alias=None, materialized=None, readonly=None) +#### UInt64Field(default=None, alias=None, materialized=None, readonly=None, codec=None) ### UInt8Field Extends BaseIntField -#### UInt8Field(default=None, alias=None, materialized=None, readonly=None) +#### UInt8Field(default=None, alias=None, materialized=None, readonly=None, codec=None) ### UUIDField Extends Field -#### UUIDField(default=None, alias=None, materialized=None, readonly=None) +#### UUIDField(default=None, alias=None, materialized=None, readonly=None, codec=None) infi.clickhouse_orm.engines diff --git a/docs/toc.md b/docs/toc.md index 05f384b..af35983 100644 --- a/docs/toc.md +++ b/docs/toc.md @@ -36,7 +36,7 @@ * [Working with array fields](field_types.md#working-with-array-fields) * [Working with materialized and alias fields](field_types.md#working-with-materialized-and-alias-fields) * [Working with nullable fields](field_types.md#working-with-nullable-fields) - * [Creating custom field types](field_types.md#creating-custom-field-types) + * [Working with field compression codecs](field_types.md#working-with-field-compression-codecs) * [Table Engines](table_engines.md#table-engines) * [Simple Engines](table_engines.md#simple-engines) @@ -86,6 +86,7 @@ * [Int32Field](class_reference.md#int32field) * [Int64Field](class_reference.md#int64field) * [Int8Field](class_reference.md#int8field) + * [LowCardinalityField](class_reference.md#lowcardinalityfield) * [NullableField](class_reference.md#nullablefield) * [StringField](class_reference.md#stringfield) * [UInt16Field](class_reference.md#uint16field) From 7fcbad44b99f5f7fab02acf1323ff9fc2ae8cdf2 Mon Sep 17 00:00:00 2001 From: Roy Belio Date: Tue, 25 Jun 2019 11:22:32 +0300 Subject: [PATCH 7/9] RAMEN-208 Support codec compression for clickhouse typo fix --- docs/field_types.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/field_types.md b/docs/field_types.md index 73437c0..1934b87 100644 --- a/docs/field_types.md +++ b/docs/field_types.md @@ -186,7 +186,7 @@ class Stats(models.Model): engine = MergeTree('timestamp_date', ('id', 'timestamp')) ``` -:exclamation:**_This feature is supported on clickhouse version 19.1.16 and above, codec arguments will be ignored by the ORM for clickhouse versions lower than 19.1.16_** +:exclamation:**_This feature is supported on ClickHouse version 19.1.16 and above, codec arguments will be ignored by the ORM for ClickHouse versions lower than 19.1.16_** Creating custom field types --------------------------- From 8d5e47a9575c6140969e2f529efa6f15dda4f07a Mon Sep 17 00:00:00 2001 From: Roy Belio Date: Tue, 25 Jun 2019 13:24:06 +0300 Subject: [PATCH 8/9] RAMEN-206 Support LowCardinality in infi.clickhouse_orm added documentation --- docs/field_types.md | 31 +++++++++++++++++++++++++++++++ docs/toc.md | 2 ++ 2 files changed, 33 insertions(+) diff --git a/docs/field_types.md b/docs/field_types.md index 1934b87..5817ca7 100644 --- a/docs/field_types.md +++ b/docs/field_types.md @@ -188,6 +188,37 @@ class Stats(models.Model): ``` :exclamation:**_This feature is supported on ClickHouse version 19.1.16 and above, codec arguments will be ignored by the ORM for ClickHouse versions lower than 19.1.16_** +Working with LowCardinality fields +---------------------------------- +Starting with version 19.0 ClickHouse offers a new type of field to improve the performance of queries +and compaction of columns for low entropy data. + +[More specifically](https://github.com/yandex/ClickHouse/issues/4074) LowCardinality data type builds dictionaries automatically. It can use multiple different dictionaries if necessarily. +If the number of distinct values is pretty large, the dictionaries become local, several different dictionaries will be used for different ranges of data. For example, if you have too many distinct values in total, but only less than about a million values each day - then the queries by day will be processed efficiently, and queries for larger ranges will be processed rather efficiently. + +LowCardinality works independently of (generic) fields compression. +LowCardinality fields are subsequently compressed as usual. +The compression ratios of LowCardinality fields for text data may be significantly better than without LowCardinality. + +LowCardinality will give performance boost, in the form of processing speed, if the number of distinct values is less than a few millions. This is because data is processed in dictionary encoded form. + +You can find further information about LowCardinality in [this presentation](https://github.com/yandex/clickhouse-presentations/blob/master/meetup19/string_optimization.pdf). + +Usage example: +```python +class LowCardinalityModel(models.Model): + date = fields.DateField() + int32 = fields.LowCardinalityField(fields.Int32Field()) + float32 = fields.LowCardinalityField(fields.Float32Field()) + string = fields.LowCardinalityField(fields.StringField()) + nullable = fields.LowCardinalityField(fields.NullableField(fields.StringField())) + array = fields.ArrayField(fields.LowCardinalityField(fields.UInt64Field())) + + engine = MergeTree('date', ('date',)) +``` + +:exclamation:**_LowCardinality field with inner array field is not supported. Use Array field with LowCardinality inner field as seen in the example._** + Creating custom field types --------------------------- Sometimes it is convenient to use data types that are supported in Python, but have no corresponding column type in ClickHouse. In these cases it is possible to define a custom field class that knows how to convert the Pythonic object to a suitable representation in the database, and vice versa. diff --git a/docs/toc.md b/docs/toc.md index af35983..0c81cb3 100644 --- a/docs/toc.md +++ b/docs/toc.md @@ -37,6 +37,8 @@ * [Working with materialized and alias fields](field_types.md#working-with-materialized-and-alias-fields) * [Working with nullable fields](field_types.md#working-with-nullable-fields) * [Working with field compression codecs](field_types.md#working-with-field-compression-codecs) + * [Working with LowCardinality fields](field_types.md#working-with-lowcardinality-fields) + * [Creating custom field types](field_types.md#creating-custom-field-types) * [Table Engines](table_engines.md#table-engines) * [Simple Engines](table_engines.md#simple-engines) From 7da75e10ebbaa737164de8a1b73939d1352c354a Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sat, 13 Jul 2019 10:45:19 +0300 Subject: [PATCH 9/9] Releasing v1.2.0 --- CHANGELOG.md | 4 ++-- docs/field_types.md | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 925e833..97042a9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ Change Log ========== -Unreleased ----------- +v1.2.0 +------ - Add support for per-field compression codecs (rbelio, Chocorean) - Add support for low cardinality fields (rbelio) diff --git a/docs/field_types.md b/docs/field_types.md index 5817ca7..c1ceb40 100644 --- a/docs/field_types.md +++ b/docs/field_types.md @@ -25,7 +25,7 @@ Currently the following field types are supported: | Decimal32Field | Decimal32 | Decimal | Ditto | Decimal64Field | Decimal64 | Decimal | Ditto | Decimal128Field | Decimal128 | Decimal | Ditto -| UUIDField | UUID | Decimal | +| UUIDField | UUID | Decimal | | Enum8Field | Enum8 | Enum | See below | Enum16Field | Enum16 | Enum | See below | ArrayField | Array | list | See below @@ -160,7 +160,7 @@ Supported compression algorithms: | NONE | None | No compression. | LZ4 | None | LZ4 compression. | LZ4HC(`level`) | Possible `level` range: [3, 12]. | Default value: 9. Greater values stands for better compression and higher CPU usage. Recommended value range: [4,9]. -| ZSTD(`level`) | Possible `level`range: [1, 22]. | Default value: 1. Greater values stands for better compression and higher CPU usage. Levels >= 20, should be used with caution, as they require more memory. +| ZSTD(`level`) | Possible `level`range: [1, 22]. | Default value: 1. Greater values stands for better compression and higher CPU usage. Levels >= 20, should be used with caution, as they require more memory. | Delta(`delta_bytes`) | Possible `delta_bytes` range: 1, 2, 4 , 8. | Default value for `delta_bytes` is `sizeof(type)` if it is equal to 1, 2,4 or 8 and equals to 1 otherwise. Codecs can be combined in a pipeline. Default table codec is not included into pipeline (if it should be applied to a field, you have to specify it explicitly in pipeline). @@ -186,12 +186,13 @@ class Stats(models.Model): engine = MergeTree('timestamp_date', ('id', 'timestamp')) ``` -:exclamation:**_This feature is supported on ClickHouse version 19.1.16 and above, codec arguments will be ignored by the ORM for ClickHouse versions lower than 19.1.16_** + +Note: This feature is supported on ClickHouse version 19.1.16 and above. Codec arguments will be ignored by the ORM for older versions of ClickHouse. Working with LowCardinality fields ---------------------------------- Starting with version 19.0 ClickHouse offers a new type of field to improve the performance of queries -and compaction of columns for low entropy data. +and compaction of columns for low entropy data. [More specifically](https://github.com/yandex/ClickHouse/issues/4074) LowCardinality data type builds dictionaries automatically. It can use multiple different dictionaries if necessarily. If the number of distinct values is pretty large, the dictionaries become local, several different dictionaries will be used for different ranges of data. For example, if you have too many distinct values in total, but only less than about a million values each day - then the queries by day will be processed efficiently, and queries for larger ranges will be processed rather efficiently. @@ -217,7 +218,7 @@ class LowCardinalityModel(models.Model): engine = MergeTree('date', ('date',)) ``` -:exclamation:**_LowCardinality field with inner array field is not supported. Use Array field with LowCardinality inner field as seen in the example._** +Note: `LowCardinality` field with an inner array field is not supported. Use an `ArrayField` with a `LowCardinality` inner field as seen in the example. Creating custom field types ---------------------------