mirror of
https://github.com/Infinidat/infi.clickhouse_orm.git
synced 2024-11-22 09:06:41 +03:00
RAMEN-208 Support codec compression for clickhouse
This commit is contained in:
parent
40725372fe
commit
4d2ebd65fb
|
@ -148,6 +148,45 @@ to `None`.
|
|||
|
||||
NOTE: `ArrayField` of `NullableField` is not supported. Also `EnumField` cannot be nullable.
|
||||
|
||||
Working with field compression codecs
|
||||
-------------------------------------
|
||||
Besides default data compression, defined in server settings, per-field specification is also available.
|
||||
|
||||
Supported compression algorithms:
|
||||
|
||||
| Codec | Argument | Comment
|
||||
| -------------------- | -------------------------------------------| ----------------------------------------------------
|
||||
| NONE | None | No compression.
|
||||
| LZ4 | None | LZ4 compression.
|
||||
| LZ4HC(`level`) | Possible `level` range: [3, 12]. | Default value: 9. Greater values stands for better compression and higher CPU usage. Recommended value range: [4,9].
|
||||
| ZSTD(`level`) | Possible `level`range: [1, 22]. | Default value: 1. Greater values stands for better compression and higher CPU usage. Levels >= 20, should be used with caution, as they require more memory.
|
||||
| Delta(`delta_bytes`) | Possible `delta_bytes` range: 1, 2, 4 , 8. | Default value for `delta_bytes` is `sizeof(type)` if it is equal to 1, 2,4 or 8 and equals to 1 otherwise.
|
||||
|
||||
Codecs can be combined in a pipeline. Default table codec is not included into pipeline (if it should be applied to a field, you have to specify it explicitly in pipeline).
|
||||
|
||||
Recommended usage for codecs:
|
||||
- Usually, values for particular metric, stored in path does not differ significantly from point to point. Using delta-encoding allows to reduce disk space usage significantly.
|
||||
- DateTime works great with pipeline of Delta, ZSTD and the column size can be compressed to 2-3% of its original size (given a smooth datetime data)
|
||||
- Numeric types usually enjoy best compression rates with ZSTD
|
||||
- String types enjoy good compression rates with LZ4HC
|
||||
|
||||
Usage:
|
||||
```python
|
||||
class Stats(models.Model):
|
||||
|
||||
id = fields.UInt64Field(codec='ZSTD(10)')
|
||||
timestamp = fields.DateTimeField(codec='Delta,ZSTD')
|
||||
timestamp_date = fields.DateField(codec='Delta(4),ZSTD(22)')
|
||||
metadata_id = fields.Int64Field(codec='LZ4')
|
||||
status = fields.StringField(codec='LZ4HC(10)')
|
||||
calculation = fields.NullableField(fields.Float32Field(), codec='ZSTD')
|
||||
alerts = fields.ArrayField(fields.FixedStringField(length=15), codec='Delta(2),LZ4HC')
|
||||
|
||||
engine = MergeTree('timestamp_date', ('id', 'timestamp'))
|
||||
|
||||
```
|
||||
:exclamation:**_This feature is supported on clickhouse version 19.1.16 and above, codec arguments will be ignored by the ORM for clickhouse versions lower than 19.1.16_**
|
||||
|
||||
Creating custom field types
|
||||
---------------------------
|
||||
Sometimes it is convenient to use data types that are supported in Python, but have no corresponding column type in ClickHouse. In these cases it is possible to define a custom field class that knows how to convert the Pythonic object to a suitable representation in the database, and vice versa.
|
||||
|
|
|
@ -120,6 +120,8 @@ class Database(object):
|
|||
self.server_version = self._get_server_version()
|
||||
# Versions 1.1.53981 and below don't have timezone function
|
||||
self.server_timezone = self._get_server_timezone() if self.server_version > (1, 1, 53981) else pytz.utc
|
||||
# Versions 19.1.16 and below don't support codec compression
|
||||
self.has_codec_support = self.server_version >= (19, 1, 16)
|
||||
|
||||
def create_database(self):
|
||||
'''
|
||||
|
|
|
@ -19,7 +19,7 @@ class Field(object):
|
|||
class_default = 0
|
||||
db_type = None
|
||||
|
||||
def __init__(self, default=None, alias=None, materialized=None, readonly=None):
|
||||
def __init__(self, default=None, alias=None, materialized=None, readonly=None, codec=None):
|
||||
assert (None, None) in {(default, alias), (alias, materialized), (default, materialized)}, \
|
||||
"Only one of default, alias and materialized parameters can be given"
|
||||
assert alias is None or isinstance(alias, string_types) and alias != "",\
|
||||
|
@ -27,6 +27,8 @@ class Field(object):
|
|||
assert materialized is None or isinstance(materialized, string_types) and alias != "",\
|
||||
"Materialized field must be string, if given"
|
||||
assert readonly is None or type(readonly) is bool, "readonly parameter must be bool if given"
|
||||
assert codec is None or isinstance(codec, string_types) and codec != "", \
|
||||
"Codec field must be string, if given"
|
||||
|
||||
self.creation_counter = Field.creation_counter
|
||||
Field.creation_counter += 1
|
||||
|
@ -34,6 +36,7 @@ class Field(object):
|
|||
self.alias = alias
|
||||
self.materialized = materialized
|
||||
self.readonly = bool(self.alias or self.materialized or readonly)
|
||||
self.codec = codec
|
||||
|
||||
def to_python(self, value, timezone_in_use):
|
||||
'''
|
||||
|
@ -64,22 +67,25 @@ class Field(object):
|
|||
'''
|
||||
return escape(value, quote)
|
||||
|
||||
def get_sql(self, with_default_expression=True):
|
||||
def get_sql(self, with_default_expression=True, db=None):
|
||||
'''
|
||||
Returns an SQL expression describing the field (e.g. for CREATE TABLE).
|
||||
:param with_default_expression: If True, adds default value to sql.
|
||||
It doesn't affect fields with alias and materialized values.
|
||||
:param db: Database, used for checking supported features.
|
||||
'''
|
||||
sql = self.db_type
|
||||
if with_default_expression:
|
||||
if self.alias:
|
||||
return '%s ALIAS %s' % (self.db_type, self.alias)
|
||||
sql += ' ALIAS %s' % self.alias
|
||||
elif self.materialized:
|
||||
return '%s MATERIALIZED %s' % (self.db_type, self.materialized)
|
||||
sql += ' MATERIALIZED %s' % self.materialized
|
||||
else:
|
||||
default = self.to_db_string(self.default)
|
||||
return '%s DEFAULT %s' % (self.db_type, default)
|
||||
else:
|
||||
return self.db_type
|
||||
sql += ' DEFAULT %s' % default
|
||||
if self.codec and db and db.has_codec_support:
|
||||
sql+= ' CODEC(%s)' % self.codec
|
||||
return sql
|
||||
|
||||
def isinstance(self, types):
|
||||
"""
|
||||
|
@ -361,11 +367,11 @@ class BaseEnumField(Field):
|
|||
Abstract base class for all enum-type fields.
|
||||
'''
|
||||
|
||||
def __init__(self, enum_cls, default=None, alias=None, materialized=None, readonly=None):
|
||||
def __init__(self, enum_cls, default=None, alias=None, materialized=None, readonly=None, codec=None):
|
||||
self.enum_cls = enum_cls
|
||||
if default is None:
|
||||
default = list(enum_cls)[0]
|
||||
super(BaseEnumField, self).__init__(default, alias, materialized, readonly)
|
||||
super(BaseEnumField, self).__init__(default, alias, materialized, readonly, codec)
|
||||
|
||||
def to_python(self, value, timezone_in_use):
|
||||
if isinstance(value, self.enum_cls):
|
||||
|
@ -384,12 +390,14 @@ class BaseEnumField(Field):
|
|||
def to_db_string(self, value, quote=True):
|
||||
return escape(value.name, quote)
|
||||
|
||||
def get_sql(self, with_default_expression=True):
|
||||
def get_sql(self, with_default_expression=True, db=None):
|
||||
values = ['%s = %d' % (escape(item.name), item.value) for item in self.enum_cls]
|
||||
sql = '%s(%s)' % (self.db_type, ' ,'.join(values))
|
||||
if with_default_expression:
|
||||
default = self.to_db_string(self.default)
|
||||
sql = '%s DEFAULT %s' % (sql, default)
|
||||
if self.codec and db and db.has_codec_support:
|
||||
sql+= ' CODEC(%s)' % self.codec
|
||||
return sql
|
||||
|
||||
@classmethod
|
||||
|
@ -425,11 +433,11 @@ class ArrayField(Field):
|
|||
|
||||
class_default = []
|
||||
|
||||
def __init__(self, inner_field, default=None, alias=None, materialized=None, readonly=None):
|
||||
def __init__(self, inner_field, default=None, alias=None, materialized=None, readonly=None, codec=None):
|
||||
assert isinstance(inner_field, Field), "The first argument of ArrayField must be a Field instance"
|
||||
assert not isinstance(inner_field, ArrayField), "Multidimensional array fields are not supported by the ORM"
|
||||
self.inner_field = inner_field
|
||||
super(ArrayField, self).__init__(default, alias, materialized, readonly)
|
||||
super(ArrayField, self).__init__(default, alias, materialized, readonly, codec)
|
||||
|
||||
def to_python(self, value, timezone_in_use):
|
||||
if isinstance(value, text_type):
|
||||
|
@ -448,9 +456,11 @@ class ArrayField(Field):
|
|||
array = [self.inner_field.to_db_string(v, quote=True) for v in value]
|
||||
return '[' + comma_join(array) + ']'
|
||||
|
||||
def get_sql(self, with_default_expression=True):
|
||||
from .utils import escape
|
||||
return 'Array(%s)' % self.inner_field.get_sql(with_default_expression=False)
|
||||
def get_sql(self, with_default_expression=True, db=None):
|
||||
sql = 'Array(%s)' % self.inner_field.get_sql(with_default_expression=False)
|
||||
if self.codec and db and db.has_codec_support:
|
||||
sql+= ' CODEC(%s)' % self.codec
|
||||
return sql
|
||||
|
||||
|
||||
class UUIDField(Field):
|
||||
|
@ -481,12 +491,12 @@ class NullableField(Field):
|
|||
class_default = None
|
||||
|
||||
def __init__(self, inner_field, default=None, alias=None, materialized=None,
|
||||
extra_null_values=None):
|
||||
extra_null_values=None, codec=None):
|
||||
self.inner_field = inner_field
|
||||
self._null_values = [None]
|
||||
if extra_null_values:
|
||||
self._null_values.extend(extra_null_values)
|
||||
super(NullableField, self).__init__(default, alias, materialized, readonly=None)
|
||||
super(NullableField, self).__init__(default, alias, materialized, readonly=None, codec=codec)
|
||||
|
||||
def to_python(self, value, timezone_in_use):
|
||||
if value == '\\N' or value in self._null_values:
|
||||
|
@ -501,14 +511,16 @@ class NullableField(Field):
|
|||
return '\\N'
|
||||
return self.inner_field.to_db_string(value, quote=quote)
|
||||
|
||||
def get_sql(self, with_default_expression=True):
|
||||
s = 'Nullable(%s)' % self.inner_field.get_sql(with_default_expression=False)
|
||||
def get_sql(self, with_default_expression=True, db=None):
|
||||
sql = 'Nullable(%s)' % self.inner_field.get_sql(with_default_expression=False)
|
||||
if with_default_expression:
|
||||
if self.alias:
|
||||
s = '%s ALIAS %s' % (s, self.alias)
|
||||
sql += ' ALIAS %s' % self.alias
|
||||
elif self.materialized:
|
||||
s = '%s MATERIALIZED %s' % (s, self.materialized)
|
||||
sql += ' MATERIALIZED %s' % self.materialized
|
||||
elif self.default:
|
||||
default = self.to_db_string(self.default)
|
||||
s = '%s DEFAULT %s' % (s, default)
|
||||
return s
|
||||
sql += ' DEFAULT %s' % default
|
||||
if self.codec and db and db.has_codec_support:
|
||||
sql+= ' CODEC(%s)' % self.codec
|
||||
return sql
|
||||
|
|
|
@ -79,7 +79,7 @@ class AlterTable(Operation):
|
|||
if name not in table_fields:
|
||||
logger.info(' Add column %s', name)
|
||||
assert prev_name, 'Cannot add a column to the beginning of the table'
|
||||
cmd = 'ADD COLUMN %s %s' % (name, field.get_sql())
|
||||
cmd = 'ADD COLUMN %s %s' % (name, field.get_sql(db=database))
|
||||
if is_regular_field:
|
||||
cmd += ' AFTER %s' % prev_name
|
||||
self._alter_table(database, cmd)
|
||||
|
@ -93,7 +93,7 @@ class AlterTable(Operation):
|
|||
# The order of class attributes can be changed any time, so we can't count on it
|
||||
# Secondly, MATERIALIZED and ALIAS fields are always at the end of the DESC, so we can't expect them to save
|
||||
# attribute position. Watch https://github.com/Infinidat/infi.clickhouse_orm/issues/47
|
||||
model_fields = {name: field.get_sql(with_default_expression=False)
|
||||
model_fields = {name: field.get_sql(with_default_expression=False, db=database)
|
||||
for name, field in iteritems(self.model_class.fields())}
|
||||
for field_name, field_sql in self._get_table_fields(database):
|
||||
# All fields must have been created and dropped by this moment
|
||||
|
|
|
@ -190,7 +190,7 @@ class Model(with_metaclass(ModelBase)):
|
|||
parts = ['CREATE TABLE IF NOT EXISTS `%s`.`%s` (' % (db.db_name, cls.table_name())]
|
||||
cols = []
|
||||
for name, field in iteritems(cls.fields()):
|
||||
cols.append(' %s %s' % (name, field.get_sql()))
|
||||
cols.append(' %s %s' % (name, field.get_sql(db=db)))
|
||||
parts.append(',\n'.join(cols))
|
||||
parts.append(')')
|
||||
parts.append('ENGINE = ' + cls.engine.create_table_sql(db))
|
||||
|
@ -316,7 +316,7 @@ class MergeModel(Model):
|
|||
cols = []
|
||||
for name, field in iteritems(cls.fields()):
|
||||
if name != '_table':
|
||||
cols.append(' %s %s' % (name, field.get_sql()))
|
||||
cols.append(' %s %s' % (name, field.get_sql(db=db)))
|
||||
parts.append(',\n'.join(cols))
|
||||
parts.append(')')
|
||||
parts.append('ENGINE = ' + cls.engine.create_table_sql(db))
|
||||
|
|
6
tests/sample_migrations/0015.py
Normal file
6
tests/sample_migrations/0015.py
Normal file
|
@ -0,0 +1,6 @@
|
|||
from infi.clickhouse_orm import migrations
|
||||
from ..test_migrations import *
|
||||
|
||||
operations = [
|
||||
migrations.AlterTableWithBuffer(Model4_compressed)
|
||||
]
|
123
tests/test_compressed_fields.py
Normal file
123
tests/test_compressed_fields.py
Normal file
|
@ -0,0 +1,123 @@
|
|||
from __future__ import unicode_literals
|
||||
import unittest
|
||||
import datetime
|
||||
import pytz
|
||||
|
||||
from infi.clickhouse_orm.database import Database
|
||||
from infi.clickhouse_orm.models import Model
|
||||
from infi.clickhouse_orm.fields import *
|
||||
from infi.clickhouse_orm.engines import *
|
||||
from infi.clickhouse_orm.utils import parse_tsv
|
||||
|
||||
|
||||
class CompressedFieldsTestCase(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.database = Database('test-db', log_statements=True)
|
||||
self.database.create_table(CompressedModel)
|
||||
|
||||
def tearDown(self):
|
||||
self.database.drop_database()
|
||||
|
||||
def test_defaults(self):
|
||||
# Check that all fields have their explicit or implicit defaults
|
||||
instance = CompressedModel()
|
||||
self.database.insert([instance])
|
||||
self.assertEqual(instance.date_field, datetime.date(1970, 1, 1))
|
||||
self.assertEqual(instance.datetime_field, datetime.datetime(1970, 1, 1, tzinfo=pytz.utc))
|
||||
self.assertEqual(instance.string_field, 'dozo')
|
||||
self.assertEqual(instance.int64_field, 42)
|
||||
self.assertEqual(instance.float_field, 0)
|
||||
self.assertEqual(instance.nullable_field, None)
|
||||
self.assertEqual(instance.array_field, [])
|
||||
|
||||
def test_assignment(self):
|
||||
# Check that all fields are assigned during construction
|
||||
kwargs = dict(
|
||||
uint64_field=217,
|
||||
date_field=datetime.date(1973, 12, 6),
|
||||
datetime_field=datetime.datetime(2000, 5, 24, 10, 22, tzinfo=pytz.utc),
|
||||
string_field='aloha',
|
||||
int64_field=-50,
|
||||
float_field=3.14,
|
||||
nullable_field=-2.718281,
|
||||
array_field=['123456789123456','','a']
|
||||
)
|
||||
instance = CompressedModel(**kwargs)
|
||||
self.database.insert([instance])
|
||||
for name, value in kwargs.items():
|
||||
self.assertEqual(kwargs[name], getattr(instance, name))
|
||||
|
||||
def test_string_conversion(self):
|
||||
# Check field conversion from string during construction
|
||||
instance = CompressedModel(date_field='1973-12-06', int64_field='100', float_field='7', nullable_field=None, array_field='[a,b,c]')
|
||||
self.assertEqual(instance.date_field, datetime.date(1973, 12, 6))
|
||||
self.assertEqual(instance.int64_field, 100)
|
||||
self.assertEqual(instance.float_field, 7)
|
||||
self.assertEqual(instance.nullable_field, None)
|
||||
self.assertEqual(instance.array_field, ['a', 'b', 'c'])
|
||||
# Check field conversion from string during assignment
|
||||
instance.int64_field = '99'
|
||||
self.assertEqual(instance.int64_field, 99)
|
||||
|
||||
def test_to_dict(self):
|
||||
instance = CompressedModel(date_field='1973-12-06', int64_field='100', float_field='7', array_field='[a,b,c]')
|
||||
self.assertDictEqual(instance.to_dict(), {
|
||||
"date_field": datetime.date(1973, 12, 6),
|
||||
"int64_field": 100,
|
||||
"float_field": 7.0,
|
||||
"datetime_field": datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.utc),
|
||||
"alias_field": 0.0,
|
||||
'string_field': 'dozo',
|
||||
'nullable_field': None,
|
||||
'uint64_field': 0,
|
||||
'array_field': ['a','b','c']
|
||||
})
|
||||
self.assertDictEqual(instance.to_dict(include_readonly=False), {
|
||||
"date_field": datetime.date(1973, 12, 6),
|
||||
"int64_field": 100,
|
||||
"float_field": 7.0,
|
||||
"datetime_field": datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.utc),
|
||||
'string_field': 'dozo',
|
||||
'nullable_field': None,
|
||||
'uint64_field': 0,
|
||||
'array_field': ['a', 'b', 'c']
|
||||
})
|
||||
self.assertDictEqual(
|
||||
instance.to_dict(include_readonly=False, field_names=('int64_field', 'alias_field', 'datetime_field')), {
|
||||
"int64_field": 100,
|
||||
"datetime_field": datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.utc)
|
||||
})
|
||||
|
||||
# This test will fail on clickhouse version < 19.1.16, use skip test
|
||||
def test_confirm_compression_codec(self):
|
||||
instance = CompressedModel(date_field='1973-12-06', int64_field='100', float_field='7', array_field='[a,b,c]')
|
||||
self.database.insert([instance])
|
||||
r = self.database.raw("select name, compression_codec from system.columns where table = '{}' and database='{}' FORMAT TabSeparatedWithNamesAndTypes".format(instance.table_name(), self.database.db_name))
|
||||
lines = r.splitlines()
|
||||
field_names = parse_tsv(lines[0])
|
||||
field_types = parse_tsv(lines[1])
|
||||
data = [tuple(parse_tsv(line)) for line in lines[2:]]
|
||||
self.assertListEqual(data, [('uint64_field', 'CODEC(ZSTD(10))'),
|
||||
('datetime_field', 'CODEC(Delta(4), ZSTD(1))'),
|
||||
('date_field', 'CODEC(Delta(4), ZSTD(22))'),
|
||||
('int64_field', 'CODEC(LZ4)'),
|
||||
('string_field', 'CODEC(LZ4HC(10))'),
|
||||
('nullable_field', 'CODEC(ZSTD(1))'),
|
||||
('array_field', 'CODEC(Delta(2), LZ4HC(0))'),
|
||||
('float_field', 'CODEC(NONE)'),
|
||||
('alias_field', 'CODEC(ZSTD(4))')])
|
||||
|
||||
|
||||
class CompressedModel(Model):
|
||||
uint64_field = UInt64Field(codec='ZSTD(10)')
|
||||
datetime_field = DateTimeField(codec='Delta,ZSTD')
|
||||
date_field = DateField(codec='Delta(4),ZSTD(22)')
|
||||
int64_field = Int64Field(default=42, codec='LZ4')
|
||||
string_field = StringField(default='dozo', codec='LZ4HC(10)')
|
||||
nullable_field = NullableField(Float32Field(), codec='ZSTD')
|
||||
array_field = ArrayField(FixedStringField(length=15), codec='Delta(2),LZ4HC')
|
||||
float_field = Float32Field(codec='NONE')
|
||||
alias_field = Float32Field(alias='float_field', codec='ZSTD(4)')
|
||||
|
||||
engine = MergeTree('datetime_field', ('uint64_field', 'datetime_field'))
|
|
@ -258,3 +258,16 @@ class Model4Buffer_changed(BufferModel, Model4_changed):
|
|||
@classmethod
|
||||
def table_name(cls):
|
||||
return 'model4buffer'
|
||||
|
||||
|
||||
class Model4_compressed(Model):
|
||||
|
||||
date = DateField(codec='Delta(4),ZSTD')
|
||||
f3 = DateTimeField(codec='Delta,ZSTD(10)')
|
||||
f2 = StringField(codec='LZ4HC')
|
||||
|
||||
engine = MergeTree('date', ('date',))
|
||||
|
||||
@classmethod
|
||||
def table_name(cls):
|
||||
return 'model4'
|
Loading…
Reference in New Issue
Block a user