From 393209e624c54aa81704af12ce25a8342c400320 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sat, 6 Jun 2020 11:07:01 +0300 Subject: [PATCH 01/27] Support for model constraints --- CHANGELOG.md | 4 + docs/class_reference.md | 168 +++++++++++++++++++++++++- docs/models_and_databases.md | 19 +++ docs/schema_migrations.md | 19 +-- docs/toc.md | 10 ++ scripts/generate_ref.py | 2 +- src/infi/clickhouse_orm/migrations.py | 94 ++++++++++---- 7 files changed, 280 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index da7d3c4..b6165b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,10 @@ Change Log ========== +Unreleased +---------- +- Support for model constraints + v2.0.1 ------ - Remove unnecessary import of `six` diff --git a/docs/class_reference.md b/docs/class_reference.md index 285f9b4..616863f 100644 --- a/docs/class_reference.md +++ b/docs/class_reference.md @@ -178,7 +178,7 @@ Unrecognized field names will cause an `AttributeError`. #### Model.create_table_sql(db) -Returns the SQL command for creating a table for this model. +Returns the SQL statement for creating a table for this model. #### Model.drop_table_sql(db) @@ -308,7 +308,7 @@ Unrecognized field names will cause an `AttributeError`. #### BufferModel.create_table_sql(db) -Returns the SQL command for creating a table for this model. +Returns the SQL statement for creating a table for this model. #### BufferModel.drop_table_sql(db) @@ -422,12 +422,147 @@ Returns the instance's column values as a tab-separated line. A newline is not i - `include_readonly`: if false, returns only fields that can be inserted into database. +### MergeModel + +Extends Model + + +Model for Merge engine +Predefines virtual _table column an controls that rows can't be inserted to this table type +https://clickhouse.tech/docs/en/single/index.html#document-table_engines/merge + +#### MergeModel(**kwargs) + + +Creates a model instance, using keyword arguments as field values. +Since values are immediately converted to their Pythonic type, +invalid values will cause a `ValueError` to be raised. +Unrecognized field names will cause an `AttributeError`. + + +#### MergeModel.create_table_sql(db) + + +Returns the SQL statement for creating a table for this model. + + +#### MergeModel.drop_table_sql(db) + + +Returns the SQL command for deleting this model's table. + + +#### MergeModel.fields(writable=False) + + +Returns an `OrderedDict` of the model's fields (from name to `Field` instance). +If `writable` is true, only writable fields are included. +Callers should not modify the dictionary. + + +#### MergeModel.from_tsv(line, field_names, timezone_in_use=UTC, database=None) + + +Create a model instance from a tab-separated line. The line may or may not include a newline. +The `field_names` list must match the fields defined in the model, but does not have to include all of them. + +- `line`: the TSV-formatted data. +- `field_names`: names of the model fields in the data. +- `timezone_in_use`: the timezone to use when parsing dates and datetimes. +- `database`: if given, sets the database that this instance belongs to. + + +#### get_database() + + +Gets the `Database` that this model instance belongs to. +Returns `None` unless the instance was read from the database or written to it. + + +#### get_field(name) + + +Gets a `Field` instance given its name, or `None` if not found. + + +#### MergeModel.has_funcs_as_defaults() + + +Return True if some of the model's fields use a function expression +as a default value. 
This requires special handling when inserting instances. + + +#### MergeModel.is_read_only() + + +Returns true if the model is marked as read only. + + +#### MergeModel.is_system_model() + + +Returns true if the model represents a system table. + + +#### MergeModel.objects_in(database) + + +Returns a `QuerySet` for selecting instances of this model class. + + +#### set_database(db) + + +Sets the `Database` that this model instance belongs to. +This is done automatically when the instance is read from the database or written to it. + + +#### MergeModel.table_name() + + +Returns the model's database table name. By default this is the +class name converted to lowercase. Override this if you want to use +a different table name. + + +#### to_db_string() + + +Returns the instance as a bytestring ready to be inserted into the database. + + +#### to_dict(include_readonly=True, field_names=None) + + +Returns the instance's column values as a dict. + +- `include_readonly`: if false, returns only fields that can be inserted into database. +- `field_names`: an iterable of field names to return (optional) + + +#### to_tskv(include_readonly=True) + + +Returns the instance's column keys and values as a tab-separated line. A newline is not included. +Fields that were not assigned a value are omitted. + +- `include_readonly`: if false, returns only fields that can be inserted into database. + + +#### to_tsv(include_readonly=True) + + +Returns the instance's column values as a tab-separated line. A newline is not included. + +- `include_readonly`: if false, returns only fields that can be inserted into database. + + ### DistributedModel Extends Model -Model for Distributed engine +Model class for use with a `Distributed` engine. #### DistributedModel(**kwargs) @@ -441,6 +576,9 @@ Unrecognized field names will cause an `AttributeError`. #### DistributedModel.create_table_sql(db) +Returns the SQL statement for creating a table for this model. + + #### DistributedModel.drop_table_sql(db) @@ -541,6 +679,10 @@ Returns a `QuerySet` for selecting instances of this model class. #### set_database(db) +Sets the `Database` that this model instance belongs to. +This is done automatically when the instance is read from the database or written to it. + + #### DistributedModel.table_name() @@ -581,6 +723,26 @@ Returns the instance's column values as a tab-separated line. A newline is not i - `include_readonly`: if false, returns only fields that can be inserted into database. +### Constraint + + +Defines a model constraint. + +#### Constraint(expr) + + +Initializer. Requires an expression that ClickHouse will verify when inserting data. + + +#### create_table_sql() + + +Returns the SQL statement for defining this constraint on table creation. + + +#### str() + + infi.clickhouse_orm.fields -------------------------- diff --git a/docs/models_and_databases.md b/docs/models_and_databases.md index b1f262c..928489d 100644 --- a/docs/models_and_databases.md +++ b/docs/models_and_databases.md @@ -75,6 +75,25 @@ The table name used for the model is its class name, converted to lowercase. To def table_name(cls): return 'people' +### Model Constraints + +It is possible to define constraints which ClickHouse verifies when data is inserted. Trying to insert invalid records will raise a `ServerError`. Each constraint has a name and an expression to validate. 
For example:
+
+    from infi.clickhouse_orm import Model, Constraint, F, StringField, DateField, Float32Field, MergeTree
+
+    class Person(Model):
+
+        first_name = StringField()
+        last_name = StringField()
+        birthday = DateField()
+        height = Float32Field()
+
+        # Ensure that the birthday is not a future date
+        birthday_is_in_the_past = Constraint(birthday <= F.today())
+
+        engine = MergeTree('birthday', ('first_name', 'last_name', 'birthday'))
+
+
 Using Models
 ------------
 
diff --git a/docs/schema_migrations.md b/docs/schema_migrations.md
index 1556395..9e3fa01 100644
--- a/docs/schema_migrations.md
+++ b/docs/schema_migrations.md
@@ -33,19 +33,19 @@ Each migration file is expected to contain a list of `operations`, for example:
 
 The following operations are supported:
 
-**CreateTable**
+### CreateTable
 
 A migration operation that creates a table for a given model class. If the table already exists, the operation does nothing.
 
 In case the model class is a `BufferModel`, the operation first creates the underlying on-disk table, and then creates the buffer table.
 
-**DropTable**
+### DropTable
 
 A migration operation that drops the table of a given model class. If the table does not exist, the operation does nothing.
 
-**AlterTable**
+### AlterTable
 
 A migration operation that compares the table of a given model class to the model’s fields, and alters the table to match the model. The operation can:
 
@@ -56,14 +56,19 @@ A migration operation that compares the table of a given model cla
 
 Default values are not altered by this operation.
 
-**AlterTableWithBuffer**
+### AlterTableWithBuffer
 
 A compound migration operation for altering a buffer table and its underlying on-disk table. The buffer table is dropped, the on-disk table is altered, and then the buffer table is re-created. This is the procedure recommended in the ClickHouse documentation for handling scenarios in which the underlying table needs to be modified.
 
 Applying this migration operation to a regular table has the same effect as an `AlterTable` operation.
 
-**RunPython**
+### AlterConstraints
+
+A migration operation that adds new constraints from the model to the database table, and drops obsolete ones. Constraints are identified by their names, so a change in an existing constraint will not be detected unless its name was changed too. ClickHouse does not check that the constraints hold for existing data in the table.
+
+
+### RunPython
 
 A migration operation that runs a Python function. The function receives the `Database` instance to operate on.
 
@@ -77,9 +82,9 @@ A migration operation that runs a Python function. The function receives the `Da
     ]
 
-**RunSQL**
+### RunSQL
 
+A migration operation that runs raw SQL statements. It expects a string containing an SQL statement, or a list of statements.
Example: diff --git a/docs/toc.md b/docs/toc.md index 5805eaa..6dd5da0 100644 --- a/docs/toc.md +++ b/docs/toc.md @@ -10,6 +10,7 @@ * [Materialized fields](models_and_databases.md#materialized-fields) * [Alias fields](models_and_databases.md#alias-fields) * [Table Names](models_and_databases.md#table-names) + * [Model Constraints](models_and_databases.md#model-constraints) * [Using Models](models_and_databases.md#using-models) * [Inserting to the Database](models_and_databases.md#inserting-to-the-database) * [Reading from the Database](models_and_databases.md#reading-from-the-database) @@ -58,6 +59,13 @@ * [Schema Migrations](schema_migrations.md#schema-migrations) * [Writing Migrations](schema_migrations.md#writing-migrations) + * [CreateTable](schema_migrations.md#createtable) + * [DropTable](schema_migrations.md#droptable) + * [AlterTable](schema_migrations.md#altertable) + * [AlterTableWithBuffer](schema_migrations.md#altertablewithbuffer) + * [AlterConstraints](schema_migrations.md#alterconstraints) + * [RunPython](schema_migrations.md#runpython) + * [RunSQL](schema_migrations.md#runsql) * [Running Migrations](schema_migrations.md#running-migrations) * [System Models](system_models.md#system-models) @@ -74,7 +82,9 @@ * [infi.clickhouse_orm.models](class_reference.md#inficlickhouse_ormmodels) * [Model](class_reference.md#model) * [BufferModel](class_reference.md#buffermodel) + * [MergeModel](class_reference.md#mergemodel) * [DistributedModel](class_reference.md#distributedmodel) + * [Constraint](class_reference.md#constraint) * [infi.clickhouse_orm.fields](class_reference.md#inficlickhouse_ormfields) * [ArrayField](class_reference.md#arrayfield) * [BaseEnumField](class_reference.md#baseenumfield) diff --git a/scripts/generate_ref.py b/scripts/generate_ref.py index 6e537ec..8dc0477 100644 --- a/scripts/generate_ref.py +++ b/scripts/generate_ref.py @@ -132,7 +132,7 @@ if __name__ == '__main__': print('===============') print() module_doc([database.Database, database.DatabaseException]) - module_doc([models.Model, models.BufferModel, models.DistributedModel]) + module_doc([models.Model, models.BufferModel, models.MergeModel, models.DistributedModel, models.Constraint]) module_doc(sorted([fields.Field] + all_subclasses(fields.Field), key=lambda x: x.__name__), False) module_doc([engines.Engine] + all_subclasses(engines.Engine), False) module_doc([query.QuerySet, query.AggregateQuerySet, query.Q]) diff --git a/src/infi/clickhouse_orm/migrations.py b/src/infi/clickhouse_orm/migrations.py index cf93d9a..5361b2b 100644 --- a/src/infi/clickhouse_orm/migrations.py +++ b/src/infi/clickhouse_orm/migrations.py @@ -7,7 +7,7 @@ import logging logger = logging.getLogger('migrations') -class Operation(object): +class Operation(): ''' Base class for migration operations. ''' @@ -16,14 +16,31 @@ class Operation(object): raise NotImplementedError() # pragma: no cover -class CreateTable(Operation): +class ModelOperation(Operation): ''' - A migration operation that creates a table for a given model class. + Base class for migration operations that work on a specific model. ''' def __init__(self, model_class): + ''' + Initializer. + ''' self.model_class = model_class + def _alter_table(self, database, cmd): + ''' + Utility for running ALTER TABLE commands. + ''' + cmd = "ALTER TABLE $db.`%s` %s" % (self.model_class.table_name(), cmd) + logger.debug(cmd) + database.raw(cmd) + + +class CreateTable(ModelOperation): + ''' + A migration operation that creates a table for a given model class. 
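+    If the table already exists, the operation does nothing.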
+ ''' + def apply(self, database): logger.info(' Create table %s', self.model_class.table_name()) if issubclass(self.model_class, BufferModel): @@ -31,7 +48,7 @@ class CreateTable(Operation): database.create_table(self.model_class) -class AlterTable(Operation): +class AlterTable(ModelOperation): ''' A migration operation that compares the table of a given model class to the model's fields, and alters the table to match the model. The operation can: @@ -41,18 +58,10 @@ class AlterTable(Operation): Default values are not altered by this operation. ''' - def __init__(self, model_class): - self.model_class = model_class - def _get_table_fields(self, database): query = "DESC `%s`.`%s`" % (database.db_name, self.model_class.table_name()) return [(row.name, row.type) for row in database.select(query)] - def _alter_table(self, database, cmd): - cmd = "ALTER TABLE `%s`.`%s` %s" % (database.db_name, self.model_class.table_name(), cmd) - logger.debug(cmd) - database._send(cmd) - def apply(self, database): logger.info(' Alter table %s', self.model_class.table_name()) @@ -100,16 +109,13 @@ class AlterTable(Operation): self._alter_table(database, 'MODIFY COLUMN %s %s' % (field_name, model_fields[field_name])) -class AlterTableWithBuffer(Operation): +class AlterTableWithBuffer(ModelOperation): ''' A migration operation for altering a buffer table and its underlying on-disk table. The buffer table is dropped, the on-disk table is altered, and then the buffer table is re-created. ''' - def __init__(self, model_class): - self.model_class = model_class - def apply(self, database): if issubclass(self.model_class, BufferModel): DropTable(self.model_class).apply(database) @@ -119,25 +125,60 @@ class AlterTableWithBuffer(Operation): AlterTable(self.model_class).apply(database) -class DropTable(Operation): +class DropTable(ModelOperation): ''' A migration operation that drops the table of a given model class. ''' - def __init__(self, model_class): - self.model_class = model_class - def apply(self, database): logger.info(' Drop table %s', self.model_class.table_name()) database.drop_table(self.model_class) +class AlterConstraints(ModelOperation): + ''' + A migration operation that adds new constraints from the model to the database + table, and drops obsolete ones. Constraints are identified by their names, so + a change in an existing constraint will not be detected unless its name was changed too. + ClickHouse does not check that the constraints hold for existing data in the table. + ''' + + def apply(self, database): + logger.info(' Alter constraints for %s', self.model_class.table_name()) + existing = self._get_constraint_names(database) + # Go over constraints in the model + for constraint in self.model_class._constraints.values(): + # Check if it's a new constraint + if constraint.name not in existing: + logger.info(' Add constraint %s', constraint.name) + self._alter_table(database, 'ADD %s' % constraint.create_table_sql()) + else: + existing.remove(constraint.name) + # Remaining constraints in `existing` are obsolete + for name in existing: + logger.info(' Drop constraint %s', name) + self._alter_table(database, 'DROP CONSTRAINT `%s`' % name) + + def _get_constraint_names(self, database): + ''' + Returns a set containing the names of existing constraints in the table. 
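+        The names are extracted from the table's SHOW CREATE TABLE statement.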
+ ''' + import re + create_table_sql = database.raw('SHOW CREATE TABLE $db.`%s`' % self.model_class.table_name()) + matches = re.findall(r'\sCONSTRAINT\s+`?(.+?)`?\s+CHECK\s', create_table_sql, flags=re.IGNORECASE) + return set(matches) + + class RunPython(Operation): ''' - A migration operation that executes given python function on database + A migration operation that executes a Python function. ''' def __init__(self, func): - assert callable(func), "'func' parameter must be function" + ''' + Initializer. The given Python function will be called with a single + argument - the Database instance to apply the migration to. + ''' + assert callable(func), "'func' argument must be function" self._func = func def apply(self, database): @@ -147,14 +188,17 @@ class RunPython(Operation): class RunSQL(Operation): ''' - A migration operation that executes given SQL on database + A migration operation that executes arbitrary SQL statements. ''' def __init__(self, sql): + ''' + Initializer. The given sql argument must be a valid SQL statement or + list of statements. + ''' if isinstance(sql, str): sql = [sql] - - assert isinstance(sql, list), "'sql' parameter must be string or list of strings" + assert isinstance(sql, list), "'sql' argument must be string or list of strings" self._sql = sql def apply(self, database): From ffd9bab0ef58c5670f3a39c978525f5f32a00742 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sat, 6 Jun 2020 11:07:25 +0300 Subject: [PATCH 02/27] Support for model constraints --- src/infi/clickhouse_orm/models.py | 79 +++++++++++++++++++++++++------ tests/sample_migrations/0016.py | 6 +++ tests/sample_migrations/0017.py | 6 +++ tests/test_constraints.py | 45 ++++++++++++++++++ tests/test_migrations.py | 54 ++++++++++++++++++++- 5 files changed, 174 insertions(+), 16 deletions(-) create mode 100644 tests/sample_migrations/0016.py create mode 100644 tests/sample_migrations/0017.py create mode 100644 tests/test_constraints.py diff --git a/src/infi/clickhouse_orm/models.py b/src/infi/clickhouse_orm/models.py index e4766e5..506185b 100644 --- a/src/infi/clickhouse_orm/models.py +++ b/src/infi/clickhouse_orm/models.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import sys from collections import OrderedDict +from itertools import chain from logging import getLogger import pytz @@ -14,6 +15,31 @@ from .engines import Merge, Distributed logger = getLogger('clickhouse_orm') + +class Constraint(): + ''' + Defines a model constraint. + ''' + + name = None # this is set by the parent model + parent = None # this is set by the parent model + + def __init__(self, expr): + ''' + Initializer. Expects an expression that ClickHouse will verify when inserting data. + ''' + self.expr = expr + + def create_table_sql(self): + ''' + Returns the SQL statement for defining this constraint during table creation. + ''' + return 'CONSTRAINT `%s` CHECK %s' % (self.name, self.expr) + + def str(self): + return self.create_table_sql() + + class ModelBase(type): ''' A metaclass for ORM models. It adds the _fields list to model classes. 
@@ -22,18 +48,21 @@ class ModelBase(type): ad_hoc_model_cache = {} def __new__(cls, name, bases, attrs): - # Collect fields from parent classes - base_fields = dict() + # Collect fields and constraints from parent classes + fields = dict() + constraints = dict() for base in bases: if isinstance(base, ModelBase): - base_fields.update(base._fields) + fields.update(base._fields) + constraints.update(base._constraints) - fields = base_fields - - # Build a list of fields, in the order they were listed in the class + # Build a list of (name, field) tuples, in the order they were listed in the class fields.update({n: f for n, f in attrs.items() if isinstance(f, Field)}) fields = sorted(fields.items(), key=lambda item: item[1].creation_counter) + # Build a list of constraints + constraints.update({n: c for n, c in attrs.items() if isinstance(c, Constraint)}) + # Build a dictionary of default values defaults = {} has_funcs_as_defaults = False @@ -49,16 +78,17 @@ class ModelBase(type): attrs = dict( attrs, _fields=OrderedDict(fields), + _constraints=constraints, _writable_fields=OrderedDict([f for f in fields if not f[1].readonly]), _defaults=defaults, _has_funcs_as_defaults=has_funcs_as_defaults ) model = super(ModelBase, cls).__new__(cls, str(name), bases, attrs) - # Let each field know its parent and its own name - for n, f in fields: - setattr(f, 'parent', model) - setattr(f, 'name', n) + # Let each field and constraint know its parent and its own name + for n, obj in chain(fields, constraints.items()): + setattr(obj, 'parent', model) + setattr(obj, 'name', n) return model @@ -222,17 +252,27 @@ class Model(metaclass=ModelBase): @classmethod def create_table_sql(cls, db): ''' - Returns the SQL command for creating a table for this model. + Returns the SQL statement for creating a table for this model. ''' parts = ['CREATE TABLE IF NOT EXISTS `%s`.`%s` (' % (db.db_name, cls.table_name())] cols = [] for name, field in cls.fields().items(): cols.append(' %s %s' % (name, field.get_sql(db=db))) parts.append(',\n'.join(cols)) + parts.append(cls._constraints_sql()) parts.append(')') parts.append('ENGINE = ' + cls.engine.create_table_sql(db)) return '\n'.join(parts) + @classmethod + def _constraints_sql(cls): + ''' + Returns this model's contraints as SQL. + ''' + if not cls._constraints: + return '' + return ',' + ',\n'.join(c.create_table_sql() for c in cls._constraints.values()) + @classmethod def drop_table_sql(cls, db): ''' @@ -348,7 +388,7 @@ class BufferModel(Model): @classmethod def create_table_sql(cls, db): ''' - Returns the SQL command for creating a table for this model. + Returns the SQL statement for creating a table for this model. ''' parts = ['CREATE TABLE IF NOT EXISTS `%s`.`%s` AS `%s`.`%s`' % (db.db_name, cls.table_name(), db.db_name, cls.engine.main_model.table_name())] @@ -370,6 +410,9 @@ class MergeModel(Model): @classmethod def create_table_sql(cls, db): + ''' + Returns the SQL statement for creating a table for this model. 
+        '''
         assert isinstance(cls.engine, Merge), "engine must be an instance of engines.Merge"
         parts = ['CREATE TABLE IF NOT EXISTS `%s`.`%s` (' % (db.db_name, cls.table_name())]
         cols = []
@@ -377,6 +420,7 @@ class MergeModel(Model):
             if name != '_table':
                 cols.append('    %s %s' % (name, field.get_sql(db=db)))
         parts.append(',\n'.join(cols))
+        parts.append(cls._constraints_sql())
         parts.append(')')
         parts.append('ENGINE = ' + cls.engine.create_table_sql(db))
         return '\n'.join(parts)
@@ -386,10 +430,14 @@
 
 class DistributedModel(Model):
     """
-    Model for Distributed engine
+    Model class for use with a `Distributed` engine.
     """
 
     def set_database(self, db):
+        '''
+        Sets the `Database` that this model instance belongs to.
+        This is done automatically when the instance is read from the database or written to it.
+        '''
         assert isinstance(self.engine, Distributed), "engine must be an instance of engines.Distributed"
         res = super(DistributedModel, self).set_database(db)
         return res
@@ -447,6 +495,9 @@
 
     @classmethod
     def create_table_sql(cls, db):
+        '''
+        Returns the SQL statement for creating a table for this model.
+        '''
        assert isinstance(cls.engine, Distributed), "engine must be engines.Distributed instance"
 
        cls.fix_engine_table()
@@ -459,4 +510,4 @@
 
 # Expose only relevant classes in import *
-__all__ = get_subclass_names(locals(), Model)
+__all__ = get_subclass_names(locals(), (Model, Constraint))
diff --git a/tests/sample_migrations/0016.py b/tests/sample_migrations/0016.py
new file mode 100644
index 0000000..6f0f814
--- /dev/null
+++ b/tests/sample_migrations/0016.py
@@ -0,0 +1,6 @@
+from infi.clickhouse_orm import migrations
+from ..test_migrations import *
+
+operations = [
+    migrations.CreateTable(ModelWithConstraints)
+]
diff --git a/tests/sample_migrations/0017.py b/tests/sample_migrations/0017.py
new file mode 100644
index 0000000..4151189
--- /dev/null
+++ b/tests/sample_migrations/0017.py
@@ -0,0 +1,6 @@
+from infi.clickhouse_orm import migrations
+from ..test_migrations import *
+
+operations = [
+    migrations.AlterConstraints(ModelWithConstraints2)
+]
diff --git a/tests/test_constraints.py b/tests/test_constraints.py
new file mode 100644
index 0000000..a14ed6c
--- /dev/null
+++ b/tests/test_constraints.py
@@ -0,0 +1,45 @@
+import unittest
+
+from infi.clickhouse_orm import *
+from .base_test_with_data import Person
+
+
+class ArrayFieldsTest(unittest.TestCase):
+
+    def setUp(self):
+        self.database = Database('test-db', log_statements=True)
+        self.database.create_table(PersonWithConstraints)
+
+    def tearDown(self):
+        self.database.drop_database()
+
+    def test_insert_valid_values(self):
+        self.database.insert([
+            PersonWithConstraints(first_name="Mike", last_name="Caruzo", birthday="2000-01-01", height=1.66)
+        ])
+
+    def test_insert_invalid_values(self):
+        if self.database.server_version < (19, 14, 3, 3):
+            raise unittest.SkipTest('ClickHouse version too old')
+
+        with self.assertRaises(ServerError) as e:
+            self.database.insert([
+                PersonWithConstraints(first_name="Mike", last_name="Caruzo", birthday="2100-01-01", height=1.66)
+            ])
+        self.assertEqual(e.exception.code, 469)
+        self.assertTrue('Constraint `birthday_in_the_past`' in e.exception.message)
+
+        with self.assertRaises(ServerError) as e:
+            self.database.insert([
+                PersonWithConstraints(first_name="Mike", last_name="Caruzo", birthday="1970-01-01", height=3)
+            ])
+        self.assertEqual(e.exception.code, 469)
+        self.assertTrue('Constraint `max_height`' in e.exception.message)
+
+
+class 
PersonWithConstraints(Person): + + birthday_in_the_past = Constraint(Person.birthday <= F.today()) + max_height = Constraint(Person.height <= 2.75) + + diff --git a/tests/test_migrations.py b/tests/test_migrations.py index f92b1e9..e8d7776 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -1,8 +1,8 @@ from __future__ import unicode_literals import unittest -from infi.clickhouse_orm.database import Database -from infi.clickhouse_orm.models import Model, BufferModel +from infi.clickhouse_orm.database import Database, ServerError +from infi.clickhouse_orm.models import Model, BufferModel, Constraint from infi.clickhouse_orm.fields import * from infi.clickhouse_orm.engines import * from infi.clickhouse_orm.migrations import MigrationHistory @@ -94,6 +94,7 @@ class MigrationsTestCase(unittest.TestCase): self.assertTrue(self.tableExists(AliasModel1)) self.assertEqual(self.getTableFields(AliasModel1), [('date', 'Date'), ('int_field', 'Int8'), ('date_alias', 'Date'), ('int_field_plus_one', 'Int8')]) + # Codecs and low cardinality self.database.migrate('tests.sample_migrations', 15) self.assertTrue(self.tableExists(Model4_compressed)) if self.database.has_low_cardinality_support: @@ -106,6 +107,22 @@ class MigrationsTestCase(unittest.TestCase): [('date', 'Date'), ('f1', 'Int32'), ('f3', 'Float32'), ('f2', 'String'), ('f4', 'Nullable(String)'), ('f5', 'Array(UInt64)')]) + if self.database.server_version >= (19, 14, 3, 3): + # Adding constraints + self.database.migrate('tests.sample_migrations', 16) + self.assertTrue(self.tableExists(ModelWithConstraints)) + self.database.insert([ModelWithConstraints(f1=101, f2='a')]) + with self.assertRaises(ServerError): + self.database.insert([ModelWithConstraints(f1=99, f2='a')]) + with self.assertRaises(ServerError): + self.database.insert([ModelWithConstraints(f1=101, f2='x')]) + # Modifying constraints + self.database.migrate('tests.sample_migrations', 17) + self.database.insert([ModelWithConstraints(f1=99, f2='a')]) + with self.assertRaises(ServerError): + self.database.insert([ModelWithConstraints(f1=101, f2='a')]) + with self.assertRaises(ServerError): + self.database.insert([ModelWithConstraints(f1=99, f2='x')]) # Several different models with the same table name, to simulate a table that changes over time @@ -294,3 +311,36 @@ class Model2LowCardinality(Model): @classmethod def table_name(cls): return 'mig' + + +class ModelWithConstraints(Model): + + date = DateField() + f1 = Int32Field() + f2 = StringField() + + constraint = Constraint(f2.isIn(['a', 'b', 'c'])) # check reserved keyword as constraint name + f1_constraint = Constraint(f1 > 100) + + engine = MergeTree('date', ('date',)) + + @classmethod + def table_name(cls): + return 'modelwithconstraints' + + +class ModelWithConstraints2(Model): + + date = DateField() + f1 = Int32Field() + f2 = StringField() + + constraint = Constraint(f2.isIn(['a', 'b', 'c'])) + f1_constraint_new = Constraint(f1 < 100) + + engine = MergeTree('date', ('date',)) + + @classmethod + def table_name(cls): + return 'modelwithconstraints' + From 22cd908a49425bb3cbdcf4c88dfb3fdc8e2847ff Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sat, 6 Jun 2020 11:37:25 +0300 Subject: [PATCH 03/27] Support for model constraints --- tests/test_constraints.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_constraints.py b/tests/test_constraints.py index a14ed6c..1b5892d 100644 --- a/tests/test_constraints.py +++ b/tests/test_constraints.py @@ -4,10 +4,12 @@ from infi.clickhouse_orm 
import * from .base_test_with_data import Person -class ArrayFieldsTest(unittest.TestCase): +class ConstraintsTest(unittest.TestCase): def setUp(self): self.database = Database('test-db', log_statements=True) + if self.database.server_version < (19, 14, 3, 3): + raise unittest.SkipTest('ClickHouse version too old') self.database.create_table(PersonWithConstraints) def tearDown(self): @@ -19,9 +21,6 @@ class ArrayFieldsTest(unittest.TestCase): ]) def test_insert_invalid_values(self): - if self.database.server_version < (19, 14, 3, 3): - raise unittest.SkipTest('ClickHouse version too old') - with self.assertRaises(ServerError) as e: self.database.insert([ PersonWithConstraints(first_name="Mike", last_name="Caruzo", birthday="2100-01-01", height=1.66) From 635197de38f9d8f7ca37c96c6e13cd67bc62dc4c Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sat, 6 Jun 2020 20:56:32 +0300 Subject: [PATCH 04/27] Support for data skipping indexes --- CHANGELOG.md | 1 + docs/class_reference.md | 74 ++++++++++++- docs/models_and_databases.md | 60 +++++++---- docs/toc.md | 2 + scripts/generate_ref.py | 2 +- src/infi/clickhouse_orm/funcs.py | 39 +------ src/infi/clickhouse_orm/migrations.py | 65 ++++++++++-- src/infi/clickhouse_orm/models.py | 146 ++++++++++++++++++++------ src/infi/clickhouse_orm/utils.py | 36 ++++++- tests/sample_migrations/0018.py | 6 ++ tests/sample_migrations/0019.py | 6 ++ tests/test_indexes.py | 32 ++++++ tests/test_migrations.py | 112 ++++++++++++++------ 13 files changed, 446 insertions(+), 135 deletions(-) create mode 100644 tests/sample_migrations/0018.py create mode 100644 tests/sample_migrations/0019.py create mode 100644 tests/test_indexes.py diff --git a/CHANGELOG.md b/CHANGELOG.md index b6165b4..054bf18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ Change Log Unreleased ---------- - Support for model constraints +- Support for data skipping indexes v2.0.1 ------ diff --git a/docs/class_reference.md b/docs/class_reference.md index 616863f..bb4b099 100644 --- a/docs/class_reference.md +++ b/docs/class_reference.md @@ -731,16 +731,84 @@ Defines a model constraint. #### Constraint(expr) -Initializer. Requires an expression that ClickHouse will verify when inserting data. +Initializer. Expects an expression that ClickHouse will verify when inserting data. #### create_table_sql() -Returns the SQL statement for defining this constraint on table creation. +Returns the SQL statement for defining this constraint during table creation. -#### str() +### Index + + +Defines a data-skipping index. + +#### Index(expr, type, granularity) + + +Initializer. + +- `expr` - a column, expression, or tuple of columns and expressions to index. +- `type` - the index type. Use one of the following methods to specify the type: + `Index.minmax`, `Index.set`, `Index.ngrambf_v1`, `Index.tokenbf_v1` or `Index.bloom_filter`. +- `granularity` - index block size (number of multiples of the `index_granularity` defined by the engine). + + +#### bloom_filter() + + +An index that stores a Bloom filter containing values of the index expression. + +- `false_positive` - the probability (between 0 and 1) of receiving a false positive + response from the filter + + +#### create_table_sql() + + +Returns the SQL statement for defining this index during table creation. + + +#### minmax() + + +An index that stores extremes of the specified expression (if the expression is tuple, then it stores +extremes for each element of tuple). The stored info is used for skipping blocks of data like the primary key. 
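+
+For example, a minmax index over a numeric field can be declared in a model like this (the same example appears in models_and_databases.md):
+
+    height_index = Index(height, type=Index.minmax(), granularity=2)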
+ + +#### ngrambf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed) + + +An index that stores a Bloom filter containing all ngrams from a block of data. +Works only with strings. Can be used for optimization of equals, like and in expressions. + +- `n` — ngram size +- `size_of_bloom_filter_in_bytes` — Bloom filter size in bytes (you can use large values here, + for example 256 or 512, because it can be compressed well). +- `number_of_hash_functions` — The number of hash functions used in the Bloom filter. +- `random_seed` — The seed for Bloom filter hash functions. + + +#### set() + + +An index that stores unique values of the specified expression (no more than max_rows rows, +or unlimited if max_rows=0). Uses the values to check if the WHERE expression is not satisfiable +on a block of data. + + +#### tokenbf_v1(number_of_hash_functions, random_seed) + + +An index that stores a Bloom filter containing string tokens. Tokens are sequences +separated by non-alphanumeric characters. + +- `size_of_bloom_filter_in_bytes` — Bloom filter size in bytes (you can use large values here, + for example 256 or 512, because it can be compressed well). +- `number_of_hash_functions` — The number of hash functions used in the Bloom filter. +- `random_seed` — The seed for Bloom filter hash functions. infi.clickhouse_orm.fields diff --git a/docs/models_and_databases.md b/docs/models_and_databases.md index 928489d..28ab6ad 100644 --- a/docs/models_and_databases.md +++ b/docs/models_and_databases.md @@ -9,17 +9,18 @@ Defining Models --------------- Models are defined in a way reminiscent of Django's ORM, by subclassing `Model`: +```python +from infi.clickhouse_orm import Model, StringField, DateField, Float32Field, MergeTree - from infi.clickhouse_orm import Model, StringField, DateField, Float32Field, MergeTree +class Person(Model): - class Person(Model): + first_name = StringField() + last_name = StringField() + birthday = DateField() + height = Float32Field() - first_name = StringField() - last_name = StringField() - birthday = DateField() - height = Float32Field() - - engine = MergeTree('birthday', ('first_name', 'last_name', 'birthday')) + engine = MergeTree('birthday', ('first_name', 'last_name', 'birthday')) +``` The columns in the database table are represented by model fields. Each field has a type, which matches the type of the corresponding database column. All the supported fields types are listed [here](field_types.md). @@ -66,32 +67,45 @@ For additional details see [here](field_options.md). ### Table Names The table name used for the model is its class name, converted to lowercase. To override the default name, implement the `table_name` method: +```python +class Person(Model): - class Person(Model): + ... - ... - - @classmethod - def table_name(cls): - return 'people' + @classmethod + def table_name(cls): + return 'people' +``` ### Model Constraints It is possible to define constraints which ClickHouse verifies when data is inserted. Trying to insert invalid records will raise a `ServerError`. Each constraint has a name and an expression to validate. For example: +```python +class Person(Model): - from infi.clickhouse_orm import Model, Constraint, F, StringField, DateField, Float32Field, MergeTree + ... 
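+
+    # Another constraint, shown for illustration (it assumes the `height` field defined earlier on this page)
+    positive_height = Constraint(height > 0)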
- class Person(Model): + # Ensure that the birthday is not a future date + birthday_is_in_the_past = Constraint(birthday <= F.today()) +``` - first_name = StringField() - last_name = StringField() - birthday = DateField() - height = Float32Field() +### Data Skipping Indexes - # Ensure that the birthday is not a future date - birthday_is_in_the_past = Constraint(birthday <= F.today()) +Models that use an engine from the `MergeTree` family can define additional indexes over one or more columns or expressions. These indexes are used in SELECT queries for reducing the amount of data to read from the disk by skipping big blocks of data that do not satisfy the query's conditions. - engine = MergeTree('birthday', ('first_name', 'last_name', 'birthday')) +For example: +```python +class Person(Model): + + ... + + # A minmax index that can help find people taller or shorter than some height + height_index = Index(height, type=Index.minmax(), granularity=2) + + # A trigram index that can help find substrings inside people names + names_index = Index((F.lower(first_name), F.lower(last_name)), + type=Index.ngrambf_v1(3, 256, 2, 0), granularity=1) +``` Using Models diff --git a/docs/toc.md b/docs/toc.md index 6dd5da0..36a20e7 100644 --- a/docs/toc.md +++ b/docs/toc.md @@ -11,6 +11,7 @@ * [Alias fields](models_and_databases.md#alias-fields) * [Table Names](models_and_databases.md#table-names) * [Model Constraints](models_and_databases.md#model-constraints) + * [Data Skipping Indexes](models_and_databases.md#data-skipping-indexes) * [Using Models](models_and_databases.md#using-models) * [Inserting to the Database](models_and_databases.md#inserting-to-the-database) * [Reading from the Database](models_and_databases.md#reading-from-the-database) @@ -85,6 +86,7 @@ * [MergeModel](class_reference.md#mergemodel) * [DistributedModel](class_reference.md#distributedmodel) * [Constraint](class_reference.md#constraint) + * [Index](class_reference.md#index) * [infi.clickhouse_orm.fields](class_reference.md#inficlickhouse_ormfields) * [ArrayField](class_reference.md#arrayfield) * [BaseEnumField](class_reference.md#baseenumfield) diff --git a/scripts/generate_ref.py b/scripts/generate_ref.py index 8dc0477..22850ab 100644 --- a/scripts/generate_ref.py +++ b/scripts/generate_ref.py @@ -132,7 +132,7 @@ if __name__ == '__main__': print('===============') print() module_doc([database.Database, database.DatabaseException]) - module_doc([models.Model, models.BufferModel, models.MergeModel, models.DistributedModel, models.Constraint]) + module_doc([models.Model, models.BufferModel, models.MergeModel, models.DistributedModel, models.Constraint, models.Index]) module_doc(sorted([fields.Field] + all_subclasses(fields.Field), key=lambda x: x.__name__), False) module_doc([engines.Engine] + all_subclasses(engines.Engine), False) module_doc([query.QuerySet, query.AggregateQuerySet, query.Q]) diff --git a/src/infi/clickhouse_orm/funcs.py b/src/infi/clickhouse_orm/funcs.py index 8d59528..080c3d7 100644 --- a/src/infi/clickhouse_orm/funcs.py +++ b/src/infi/clickhouse_orm/funcs.py @@ -1,9 +1,8 @@ -from datetime import date, datetime, tzinfo, timedelta from functools import wraps from inspect import signature, Parameter from types import FunctionType -from .utils import is_iterable, comma_join, NO_VALUE +from .utils import is_iterable, comma_join, NO_VALUE, arg_to_sql from .query import Cond, QuerySet @@ -263,43 +262,9 @@ class F(Cond, FunctionOperatorsMixin, metaclass=FMeta): else: prefix = self.name sep = ', ' - arg_strs = 
(F._arg_to_sql(arg) for arg in self.args if arg != NO_VALUE) + arg_strs = (arg_to_sql(arg) for arg in self.args if arg != NO_VALUE) return prefix + '(' + sep.join(arg_strs) + ')' - @staticmethod - def _arg_to_sql(arg): - """ - Converts a function argument to SQL string according to its type. - Supports functions, model fields, strings, dates, datetimes, timedeltas, booleans, - None, numbers, timezones, arrays/iterables. - """ - from .fields import Field, StringField, DateTimeField, DateField - if isinstance(arg, F): - return arg.to_sql() - if isinstance(arg, Field): - return "`%s`" % arg - if isinstance(arg, str): - return StringField().to_db_string(arg) - if isinstance(arg, datetime): - return "toDateTime(%s)" % DateTimeField().to_db_string(arg) - if isinstance(arg, date): - return "toDate('%s')" % arg.isoformat() - if isinstance(arg, timedelta): - return "toIntervalSecond(%d)" % int(arg.total_seconds()) - if isinstance(arg, bool): - return str(int(arg)) - if isinstance(arg, tzinfo): - return StringField().to_db_string(arg.tzname(None)) - if arg is None: - return 'NULL' - if isinstance(arg, QuerySet): - return "(%s)" % arg - if isinstance(arg, tuple): - return '(' + comma_join(F._arg_to_sql(x) for x in arg) + ')' - if is_iterable(arg): - return '[' + comma_join(F._arg_to_sql(x) for x in arg) + ']' - return str(arg) - # Arithmetic functions @staticmethod diff --git a/src/infi/clickhouse_orm/migrations.py b/src/infi/clickhouse_orm/migrations.py index 5361b2b..0ff3f59 100644 --- a/src/infi/clickhouse_orm/migrations.py +++ b/src/infi/clickhouse_orm/migrations.py @@ -26,12 +26,13 @@ class ModelOperation(Operation): Initializer. ''' self.model_class = model_class + self.table_name = model_class.table_name() def _alter_table(self, database, cmd): ''' Utility for running ALTER TABLE commands. ''' - cmd = "ALTER TABLE $db.`%s` %s" % (self.model_class.table_name(), cmd) + cmd = "ALTER TABLE $db.`%s` %s" % (self.table_name, cmd) logger.debug(cmd) database.raw(cmd) @@ -42,7 +43,7 @@ class CreateTable(ModelOperation): ''' def apply(self, database): - logger.info(' Create table %s', self.model_class.table_name()) + logger.info(' Create table %s', self.table_name) if issubclass(self.model_class, BufferModel): database.create_table(self.model_class.engine.main_model) database.create_table(self.model_class) @@ -59,11 +60,11 @@ class AlterTable(ModelOperation): ''' def _get_table_fields(self, database): - query = "DESC `%s`.`%s`" % (database.db_name, self.model_class.table_name()) + query = "DESC `%s`.`%s`" % (database.db_name, self.table_name) return [(row.name, row.type) for row in database.select(query)] def apply(self, database): - logger.info(' Alter table %s', self.model_class.table_name()) + logger.info(' Alter table %s', self.table_name) # Note that MATERIALIZED and ALIAS fields are always at the end of the DESC, # ADD COLUMN ... 
AFTER doesn't affect it
 
@@ -131,7 +132,7 @@ class DropTable(ModelOperation):
     '''
 
     def apply(self, database):
-        logger.info('    Drop table %s', self.model_class.table_name())
+        logger.info('    Drop table %s', self.table_name)
         database.drop_table(self.model_class)
 
@@ -144,7 +145,7 @@ class AlterConstraints(ModelOperation):
     '''
 
     def apply(self, database):
-        logger.info('    Alter constraints for %s', self.model_class.table_name())
+        logger.info('    Alter constraints for %s', self.table_name)
         existing = self._get_constraint_names(database)
         # Go over constraints in the model
         for constraint in self.model_class._constraints.values():
@@ -164,8 +165,55 @@ class AlterConstraints(ModelOperation):
         Returns a set containing the names of existing constraints in the table.
         '''
         import re
-        create_table_sql = database.raw('SHOW CREATE TABLE $db.`%s`' % self.model_class.table_name())
-        matches = re.findall(r'\sCONSTRAINT\s+`?(.+?)`?\s+CHECK\s', create_table_sql, flags=re.IGNORECASE)
+        table_def = database.raw('SHOW CREATE TABLE $db.`%s`' % self.table_name)
+        matches = re.findall(r'\sCONSTRAINT\s+`?(.+?)`?\s+CHECK\s', table_def)
         return set(matches)
+
+
+class AlterIndexes(ModelOperation):
+    '''
+    A migration operation that adds new indexes from the model to the database
+    table, and drops obsolete ones. Indexes are identified by their names, so
+    a change in an existing index will not be detected unless its name was changed too.
+    '''
+
+    def __init__(self, model_class, reindex=False):
+        '''
+        Initializer.
+        By default ClickHouse does not build indexes over existing data, only for
+        new data. Passing `reindex=True` will run `OPTIMIZE TABLE` in order to build
+        the indexes over the existing data.
+        '''
+        super().__init__(model_class)
+        self.reindex = reindex
+
+    def apply(self, database):
+        logger.info('    Alter indexes for %s', self.table_name)
+        existing = self._get_index_names(database)
+        # Go over indexes in the model
+        for index in self.model_class._indexes.values():
+            # Check if it's a new index
+            if index.name not in existing:
+                logger.info('    Add index %s', index.name)
+                self._alter_table(database, 'ADD %s' % index.create_table_sql())
+            else:
+                existing.remove(index.name)
+        # Remaining indexes in `existing` are obsolete
+        for name in existing:
+            logger.info('    Drop index %s', name)
+            self._alter_table(database, 'DROP INDEX `%s`' % name)
+        # Reindex
+        if self.reindex:
+            logger.info('    Build indexes on table')
+            database.raw('OPTIMIZE TABLE $db.`%s` FINAL' % self.table_name)
+
+    def _get_index_names(self, database):
+        '''
+        Returns a set containing the names of existing indexes in the table.
+        '''
+        import re
+        table_def = database.raw('SHOW CREATE TABLE $db.`%s`' % self.table_name)
+        matches = re.findall(r'\sINDEX\s+`?(.+?)`?\s+', table_def)
+        return set(matches)
 
 
 class RunPython(Operation):
diff --git a/src/infi/clickhouse_orm/models.py b/src/infi/clickhouse_orm/models.py
index 506185b..cc535a1 100644
--- a/src/infi/clickhouse_orm/models.py
+++ b/src/infi/clickhouse_orm/models.py
@@ -7,7 +7,7 @@
 import pytz
 
 from .fields import Field, StringField
-from .utils import parse_tsv, NO_VALUE, get_subclass_names
+from .utils import parse_tsv, NO_VALUE, get_subclass_names, arg_to_sql
 from .query import QuerySet
 from .funcs import F
 from .engines import Merge, Distributed
@@ -16,7 +16,7 @@
 logger = getLogger('clickhouse_orm')
 
 
-class Constraint():
+class Constraint:
     '''
     Defines a model constraint.
''' @@ -34,10 +34,89 @@ class Constraint(): ''' Returns the SQL statement for defining this constraint during table creation. ''' - return 'CONSTRAINT `%s` CHECK %s' % (self.name, self.expr) + return 'CONSTRAINT `%s` CHECK %s' % (self.name, arg_to_sql(self.expr)) - def str(self): - return self.create_table_sql() + +class Index: + ''' + Defines a data-skipping index. + ''' + + name = None # this is set by the parent model + parent = None # this is set by the parent model + + def __init__(self, expr, type, granularity): + ''' + Initializer. + + - `expr` - a column, expression, or tuple of columns and expressions to index. + - `type` - the index type. Use one of the following methods to specify the type: + `Index.minmax`, `Index.set`, `Index.ngrambf_v1`, `Index.tokenbf_v1` or `Index.bloom_filter`. + - `granularity` - index block size (number of multiples of the `index_granularity` defined by the engine). + ''' + self.expr = expr + self.type = type + self.granularity = granularity + + def create_table_sql(self): + ''' + Returns the SQL statement for defining this index during table creation. + ''' + return 'INDEX `%s` %s TYPE %s GRANULARITY %d' % (self.name, arg_to_sql(self.expr), self.type, self.granularity) + + @staticmethod + def minmax(): + ''' + An index that stores extremes of the specified expression (if the expression is tuple, then it stores + extremes for each element of tuple). The stored info is used for skipping blocks of data like the primary key. + ''' + return 'minmax' + + @staticmethod + def set(max_rows): + ''' + An index that stores unique values of the specified expression (no more than max_rows rows, + or unlimited if max_rows=0). Uses the values to check if the WHERE expression is not satisfiable + on a block of data. + ''' + return 'set(%d)' % max_rows + + @staticmethod + def ngrambf_v1(n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed): + ''' + An index that stores a Bloom filter containing all ngrams from a block of data. + Works only with strings. Can be used for optimization of equals, like and in expressions. + + - `n` — ngram size + - `size_of_bloom_filter_in_bytes` — Bloom filter size in bytes (you can use large values here, + for example 256 or 512, because it can be compressed well). + - `number_of_hash_functions` — The number of hash functions used in the Bloom filter. + - `random_seed` — The seed for Bloom filter hash functions. + ''' + return 'ngrambf_v1(%d, %d, %d, %d)' % (n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed) + + @staticmethod + def tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed): + ''' + An index that stores a Bloom filter containing string tokens. Tokens are sequences + separated by non-alphanumeric characters. + + - `size_of_bloom_filter_in_bytes` — Bloom filter size in bytes (you can use large values here, + for example 256 or 512, because it can be compressed well). + - `number_of_hash_functions` — The number of hash functions used in the Bloom filter. + - `random_seed` — The seed for Bloom filter hash functions. + ''' + return 'tokenbf_v1(%d, %d, %d)' % (size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed) + + @staticmethod + def bloom_filter(false_positive=0.025): + ''' + An index that stores a Bloom filter containing values of the index expression. 
+ + - `false_positive` - the probability (between 0 and 1) of receiving a false positive + response from the filter + ''' + return 'bloom_filter(%f)' % false_positive class ModelBase(type): @@ -48,21 +127,29 @@ class ModelBase(type): ad_hoc_model_cache = {} def __new__(cls, name, bases, attrs): - # Collect fields and constraints from parent classes - fields = dict() - constraints = dict() + + # Collect fields, constraints and indexes from parent classes + fields = {} + constraints = {} + indexes = {} for base in bases: if isinstance(base, ModelBase): fields.update(base._fields) constraints.update(base._constraints) + indexes.update(base._indexes) - # Build a list of (name, field) tuples, in the order they were listed in the class - fields.update({n: f for n, f in attrs.items() if isinstance(f, Field)}) + # Add fields, constraints and indexes from this class + for n, obj in attrs.items(): + if isinstance(obj, Field): + fields[n] = obj + elif isinstance(obj, Constraint): + constraints[n] = obj + elif isinstance(obj, Index): + indexes[n] = obj + + # Convert fields to a list of (name, field) tuples, in the order they were listed in the class fields = sorted(fields.items(), key=lambda item: item[1].creation_counter) - # Build a list of constraints - constraints.update({n: c for n, c in attrs.items() if isinstance(c, Constraint)}) - # Build a dictionary of default values defaults = {} has_funcs_as_defaults = False @@ -75,18 +162,20 @@ class ModelBase(type): else: defaults[n] = f.to_python(f.default, pytz.UTC) + # Create the model class attrs = dict( attrs, _fields=OrderedDict(fields), _constraints=constraints, + _indexes=indexes, _writable_fields=OrderedDict([f for f in fields if not f[1].readonly]), _defaults=defaults, _has_funcs_as_defaults=has_funcs_as_defaults ) model = super(ModelBase, cls).__new__(cls, str(name), bases, attrs) - # Let each field and constraint know its parent and its own name - for n, obj in chain(fields, constraints.items()): + # Let each field, constraint and index know its parent and its own name + for n, obj in chain(fields, constraints.items(), indexes.items()): setattr(obj, 'parent', model) setattr(obj, 'name', n) @@ -255,24 +344,22 @@ class Model(metaclass=ModelBase): Returns the SQL statement for creating a table for this model. ''' parts = ['CREATE TABLE IF NOT EXISTS `%s`.`%s` (' % (db.db_name, cls.table_name())] - cols = [] + # Fields + items = [] for name, field in cls.fields().items(): - cols.append(' %s %s' % (name, field.get_sql(db=db))) - parts.append(',\n'.join(cols)) - parts.append(cls._constraints_sql()) + items.append(' %s %s' % (name, field.get_sql(db=db))) + # Constraints + for c in cls._constraints.values(): + items.append(' %s' % c.create_table_sql()) + # Indexes + for i in cls._indexes.values(): + items.append(' %s' % i.create_table_sql()) + parts.append(',\n'.join(items)) + # Engine parts.append(')') parts.append('ENGINE = ' + cls.engine.create_table_sql(db)) return '\n'.join(parts) - @classmethod - def _constraints_sql(cls): - ''' - Returns this model's contraints as SQL. 
- ''' - if not cls._constraints: - return '' - return ',' + ',\n'.join(c.create_table_sql() for c in cls._constraints.values()) - @classmethod def drop_table_sql(cls, db): ''' @@ -420,7 +507,6 @@ class MergeModel(Model): if name != '_table': cols.append(' %s %s' % (name, field.get_sql(db=db))) parts.append(',\n'.join(cols)) - parts.append(cls._constraints_sql()) parts.append(')') parts.append('ENGINE = ' + cls.engine.create_table_sql(db)) return '\n'.join(parts) @@ -510,4 +596,4 @@ class DistributedModel(Model): # Expose only relevant classes in import * -__all__ = get_subclass_names(locals(), (Model, Constraint)) +__all__ = get_subclass_names(locals(), (Model, Constraint, Index)) diff --git a/src/infi/clickhouse_orm/utils.py b/src/infi/clickhouse_orm/utils.py index a487bfe..c0d0325 100644 --- a/src/infi/clickhouse_orm/utils.py +++ b/src/infi/clickhouse_orm/utils.py @@ -1,6 +1,6 @@ -from __future__ import unicode_literals import codecs import re +from datetime import date, datetime, tzinfo, timedelta SPECIAL_CHARS = { @@ -42,6 +42,40 @@ def string_or_func(obj): return obj.to_sql() if hasattr(obj, 'to_sql') else obj +def arg_to_sql(arg): + """ + Converts a function argument to SQL string according to its type. + Supports functions, model fields, strings, dates, datetimes, timedeltas, booleans, + None, numbers, timezones, arrays/iterables. + """ + from infi.clickhouse_orm import Field, StringField, DateTimeField, DateField, F, QuerySet + if isinstance(arg, F): + return arg.to_sql() + if isinstance(arg, Field): + return "`%s`" % arg + if isinstance(arg, str): + return StringField().to_db_string(arg) + if isinstance(arg, datetime): + return "toDateTime(%s)" % DateTimeField().to_db_string(arg) + if isinstance(arg, date): + return "toDate('%s')" % arg.isoformat() + if isinstance(arg, timedelta): + return "toIntervalSecond(%d)" % int(arg.total_seconds()) + if isinstance(arg, bool): + return str(int(arg)) + if isinstance(arg, tzinfo): + return StringField().to_db_string(arg.tzname(None)) + if arg is None: + return 'NULL' + if isinstance(arg, QuerySet): + return "(%s)" % arg + if isinstance(arg, tuple): + return '(' + comma_join(arg_to_sql(x) for x in arg) + ')' + if is_iterable(arg): + return '[' + comma_join(arg_to_sql(x) for x in arg) + ']' + return str(arg) + + def parse_tsv(line): if isinstance(line, bytes): line = line.decode() diff --git a/tests/sample_migrations/0018.py b/tests/sample_migrations/0018.py new file mode 100644 index 0000000..c34c137 --- /dev/null +++ b/tests/sample_migrations/0018.py @@ -0,0 +1,6 @@ +from infi.clickhouse_orm import migrations +from ..test_migrations import * + +operations = [ + migrations.CreateTable(ModelWithIndex) +] diff --git a/tests/sample_migrations/0019.py b/tests/sample_migrations/0019.py new file mode 100644 index 0000000..67ba244 --- /dev/null +++ b/tests/sample_migrations/0019.py @@ -0,0 +1,6 @@ +from infi.clickhouse_orm import migrations +from ..test_migrations import * + +operations = [ + migrations.AlterIndexes(ModelWithIndex2, reindex=True) +] diff --git a/tests/test_indexes.py b/tests/test_indexes.py new file mode 100644 index 0000000..2507e92 --- /dev/null +++ b/tests/test_indexes.py @@ -0,0 +1,32 @@ +import unittest + +from infi.clickhouse_orm import * + + +class IndexesTest(unittest.TestCase): + + def setUp(self): + self.database = Database('test-db', log_statements=True) + if self.database.server_version < (19, 3, 3): + raise unittest.SkipTest('ClickHouse version too old') + + def tearDown(self): + self.database.drop_database() + + def 
test_all_index_types(self):
+        self.database.create_table(ModelWithIndexes)
+
+
+class ModelWithIndexes(Model):
+
+    date = DateField()
+    f1 = Int32Field()
+    f2 = StringField()
+
+    i1 = Index(f1, type=Index.minmax(), granularity=1)
+    i2 = Index(f1, type=Index.set(1000), granularity=2)
+    i3 = Index(f2, type=Index.ngrambf_v1(3, 256, 2, 0), granularity=1)
+    i4 = Index(F.lower(f2), type=Index.tokenbf_v1(256, 2, 0), granularity=2)
+    i5 = Index((F.toQuarter(date), f2), type=Index.bloom_filter(), granularity=3)
+
+    engine = MergeTree('date', ('date',))
diff --git a/tests/test_migrations.py b/tests/test_migrations.py
index e8d7776..61b06b8 100644
--- a/tests/test_migrations.py
+++ b/tests/test_migrations.py
@@ -2,7 +2,7 @@ from __future__ import unicode_literals
 import unittest
 
 from infi.clickhouse_orm.database import Database, ServerError
-from infi.clickhouse_orm.models import Model, BufferModel, Constraint
+from infi.clickhouse_orm.models import Model, BufferModel, Constraint, Index
 from infi.clickhouse_orm.fields import *
 from infi.clickhouse_orm.engines import *
 from infi.clickhouse_orm.migrations import MigrationHistory
@@ -27,55 +27,58 @@ class MigrationsTestCase(unittest.TestCase):
 
     def tearDown(self):
         self.database.drop_database()
 
-    def tableExists(self, model_class):
+    def table_exists(self, model_class):
         query = "EXISTS TABLE $db.`%s`" % model_class.table_name()
         return next(self.database.select(query)).result == 1
 
-    def getTableFields(self, model_class):
+    def get_table_fields(self, model_class):
         query = "DESC `%s`.`%s`" % (self.database.db_name, model_class.table_name())
         return [(row.name, row.type) for row in self.database.select(query)]
 
+    def get_table_def(self, model_class):
+        return self.database.raw('SHOW CREATE TABLE $db.`%s`' % model_class.table_name())
+
     def test_migrations(self):
         # Creation and deletion of table
         self.database.migrate('tests.sample_migrations', 1)
-        self.assertTrue(self.tableExists(Model1))
+        self.assertTrue(self.table_exists(Model1))
         self.database.migrate('tests.sample_migrations', 2)
-        self.assertFalse(self.tableExists(Model1))
+        self.assertFalse(self.table_exists(Model1))
         self.database.migrate('tests.sample_migrations', 3)
-        self.assertTrue(self.tableExists(Model1))
+        self.assertTrue(self.table_exists(Model1))
         # Adding, removing and altering simple fields
-        self.assertEqual(self.getTableFields(Model1), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')])
+        self.assertEqual(self.get_table_fields(Model1), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')])
         self.database.migrate('tests.sample_migrations', 4)
-        self.assertEqual(self.getTableFields(Model2), [('date', 'Date'), ('f1', 'Int32'), ('f3', 'Float32'), ('f2', 'String'), ('f4', 'String'), ('f5', 'Array(UInt64)')])
+        self.assertEqual(self.get_table_fields(Model2), [('date', 'Date'), ('f1', 'Int32'), ('f3', 'Float32'), ('f2', 'String'), ('f4', 'String'), ('f5', 'Array(UInt64)')])
         self.database.migrate('tests.sample_migrations', 5)
-        self.assertEqual(self.getTableFields(Model3), [('date', 'Date'), ('f1', 'Int64'), ('f3', 'Float64'), ('f4', 'String')])
+        self.assertEqual(self.get_table_fields(Model3), [('date', 'Date'), ('f1', 'Int64'), ('f3', 'Float64'), ('f4', 'String')])
         # Altering enum fields
         self.database.migrate('tests.sample_migrations', 6)
-        self.assertTrue(self.tableExists(EnumModel1))
-        self.assertEqual(self.getTableFields(EnumModel1),
+        self.assertTrue(self.table_exists(EnumModel1))
+        self.assertEqual(self.get_table_fields(EnumModel1),
                          [('date', 'Date'), ('f1', "Enum8('dog' = 1, 'cat' = 2, 'cow' = 3)")])
self.database.migrate('tests.sample_migrations', 7) - self.assertTrue(self.tableExists(EnumModel1)) - self.assertEqual(self.getTableFields(EnumModel2), + self.assertTrue(self.table_exists(EnumModel1)) + self.assertEqual(self.get_table_fields(EnumModel2), [('date', 'Date'), ('f1', "Enum16('dog' = 1, 'cat' = 2, 'horse' = 3, 'pig' = 4)")]) # Materialized fields and alias fields self.database.migrate('tests.sample_migrations', 8) - self.assertTrue(self.tableExists(MaterializedModel)) - self.assertEqual(self.getTableFields(MaterializedModel), + self.assertTrue(self.table_exists(MaterializedModel)) + self.assertEqual(self.get_table_fields(MaterializedModel), [('date_time', "DateTime"), ('date', 'Date')]) self.database.migrate('tests.sample_migrations', 9) - self.assertTrue(self.tableExists(AliasModel)) - self.assertEqual(self.getTableFields(AliasModel), + self.assertTrue(self.table_exists(AliasModel)) + self.assertEqual(self.get_table_fields(AliasModel), [('date', 'Date'), ('date_alias', "Date")]) # Buffer models creation and alteration self.database.migrate('tests.sample_migrations', 10) - self.assertTrue(self.tableExists(Model4)) - self.assertTrue(self.tableExists(Model4Buffer)) - self.assertEqual(self.getTableFields(Model4), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')]) - self.assertEqual(self.getTableFields(Model4Buffer), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')]) + self.assertTrue(self.table_exists(Model4)) + self.assertTrue(self.table_exists(Model4Buffer)) + self.assertEqual(self.get_table_fields(Model4), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')]) + self.assertEqual(self.get_table_fields(Model4Buffer), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')]) self.database.migrate('tests.sample_migrations', 11) - self.assertEqual(self.getTableFields(Model4), [('date', 'Date'), ('f3', 'DateTime'), ('f2', 'String')]) - self.assertEqual(self.getTableFields(Model4Buffer), [('date', 'Date'), ('f3', 'DateTime'), ('f2', 'String')]) + self.assertEqual(self.get_table_fields(Model4), [('date', 'Date'), ('f3', 'DateTime'), ('f2', 'String')]) + self.assertEqual(self.get_table_fields(Model4Buffer), [('date', 'Date'), ('f3', 'DateTime'), ('f2', 'String')]) self.database.migrate('tests.sample_migrations', 12) self.assertEqual(self.database.count(Model3), 3) @@ -88,29 +91,29 @@ class MigrationsTestCase(unittest.TestCase): self.assertListEqual(data, [1, 2, 3, 4]) self.database.migrate('tests.sample_migrations', 14) - self.assertTrue(self.tableExists(MaterializedModel1)) - self.assertEqual(self.getTableFields(MaterializedModel1), + self.assertTrue(self.table_exists(MaterializedModel1)) + self.assertEqual(self.get_table_fields(MaterializedModel1), [('date_time', 'DateTime'), ('int_field', 'Int8'), ('date', 'Date'), ('int_field_plus_one', 'Int8')]) - self.assertTrue(self.tableExists(AliasModel1)) - self.assertEqual(self.getTableFields(AliasModel1), + self.assertTrue(self.table_exists(AliasModel1)) + self.assertEqual(self.get_table_fields(AliasModel1), [('date', 'Date'), ('int_field', 'Int8'), ('date_alias', 'Date'), ('int_field_plus_one', 'Int8')]) # Codecs and low cardinality self.database.migrate('tests.sample_migrations', 15) - self.assertTrue(self.tableExists(Model4_compressed)) + self.assertTrue(self.table_exists(Model4_compressed)) if self.database.has_low_cardinality_support: - self.assertEqual(self.getTableFields(Model2LowCardinality), + self.assertEqual(self.get_table_fields(Model2LowCardinality), [('date', 'Date'), ('f1', 'LowCardinality(Int32)'), ('f3', 
'LowCardinality(Float32)'), ('f2', 'LowCardinality(String)'), ('f4', 'LowCardinality(Nullable(String))'), ('f5', 'Array(LowCardinality(UInt64))')]) else: logging.warning('No support for low cardinality') - self.assertEqual(self.getTableFields(Model2), + self.assertEqual(self.get_table_fields(Model2), [('date', 'Date'), ('f1', 'Int32'), ('f3', 'Float32'), ('f2', 'String'), ('f4', 'Nullable(String)'), ('f5', 'Array(UInt64)')]) if self.database.server_version >= (19, 14, 3, 3): - # Adding constraints + # Creating constraints self.database.migrate('tests.sample_migrations', 16) - self.assertTrue(self.tableExists(ModelWithConstraints)) + self.assertTrue(self.table_exists(ModelWithConstraints)) self.database.insert([ModelWithConstraints(f1=101, f2='a')]) with self.assertRaises(ServerError): self.database.insert([ModelWithConstraints(f1=99, f2='a')]) @@ -124,6 +127,19 @@ class MigrationsTestCase(unittest.TestCase): with self.assertRaises(ServerError): self.database.insert([ModelWithConstraints(f1=99, f2='x')]) + if self.database.server_version < (19, 3, 3): + # Creating indexes + self.database.migrate('tests.sample_migrations', 18) + self.assertTrue(self.table_exists(ModelWithIndex)) + self.assertIn('INDEX `index`', self.get_table_def()) + self.assertIn('INDEX another_index', self.get_table_def()) + # Modifying indexes + self.database.migrate('tests.sample_migrations', 19) + self.assertNotIn('INDEX `index`', self.get_table_def()) + self.assertIn('INDEX index2', self.get_table_def()) + self.assertIn('INDEX another_index', self.get_table_def()) + + # Several different models with the same table name, to simulate a table that changes over time class Model1(Model): @@ -344,3 +360,35 @@ class ModelWithConstraints2(Model): def table_name(cls): return 'modelwithconstraints' + +class ModelWithIndex(Model): + + date = DateField() + f1 = Int32Field() + f2 = StringField() + + index = Index(f1, type=Index.minmax(), granularity=1) + another_index = Index(f2, type=Index.set(0), granularity=1) + + engine = MergeTree('date', ('date',)) + + @classmethod + def table_name(cls): + return 'modelwithindex' + + +class ModelWithIndex2(Model): + + date = DateField() + f1 = Int32Field() + f2 = StringField() + + index2 = Index(f1, type=Index.bloom_filter(), granularity=2) + another_index = Index(f2, type=Index.set(0), granularity=1) + + engine = MergeTree('date', ('date',)) + + @classmethod + def table_name(cls): + return 'modelwithindex' + From 4be1b0437f85803c5ac77ce89adfa4dbaf2550b6 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sat, 6 Jun 2020 21:06:29 +0300 Subject: [PATCH 05/27] Support for data skipping indexes --- tests/test_indexes.py | 2 +- tests/test_migrations.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_indexes.py b/tests/test_indexes.py index 2507e92..0dccaea 100644 --- a/tests/test_indexes.py +++ b/tests/test_indexes.py @@ -7,7 +7,7 @@ class IndexesTest(unittest.TestCase): def setUp(self): self.database = Database('test-db', log_statements=True) - if self.database.server_version < (19, 3, 3): + if self.database.server_version < (20, 1, 2, 4): raise unittest.SkipTest('ClickHouse version too old') def tearDown(self): diff --git a/tests/test_migrations.py b/tests/test_migrations.py index 61b06b8..4654f03 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -127,7 +127,7 @@ class MigrationsTestCase(unittest.TestCase): with self.assertRaises(ServerError): self.database.insert([ModelWithConstraints(f1=99, f2='x')]) - if self.database.server_version < (19, 
3, 3): + if self.database.server_version < (20, 1, 2, 4): # Creating indexes self.database.migrate('tests.sample_migrations', 18) self.assertTrue(self.table_exists(ModelWithIndex)) From aaa1038a70e539278bc71dcf09bbff55aa50f3ca Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sat, 6 Jun 2020 21:21:18 +0300 Subject: [PATCH 06/27] Support for data skipping indexes --- tests/test_migrations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_migrations.py b/tests/test_migrations.py index 4654f03..c98cdba 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -127,7 +127,7 @@ class MigrationsTestCase(unittest.TestCase): with self.assertRaises(ServerError): self.database.insert([ModelWithConstraints(f1=99, f2='x')]) - if self.database.server_version < (20, 1, 2, 4): + if self.database.server_version >= (20, 1, 2, 4): # Creating indexes self.database.migrate('tests.sample_migrations', 18) self.assertTrue(self.table_exists(ModelWithIndex)) From 667cde168504d33b0d2ef5d7cca56fd78f29d51c Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sat, 6 Jun 2020 21:34:09 +0300 Subject: [PATCH 07/27] Support for data skipping indexes --- src/infi/clickhouse_orm/migrations.py | 2 +- tests/test_migrations.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/infi/clickhouse_orm/migrations.py b/src/infi/clickhouse_orm/migrations.py index 0ff3f59..c8c656a 100644 --- a/src/infi/clickhouse_orm/migrations.py +++ b/src/infi/clickhouse_orm/migrations.py @@ -206,7 +206,7 @@ class AlterIndexes(ModelOperation): # Reindex if self.reindex: logger.info(' Build indexes on table') - self.database.raw('OPTIMIZE TABLE $db.`%s` FINAL' % self.table_name) + database.raw('OPTIMIZE TABLE $db.`%s` FINAL' % self.table_name) def _get_index_names(self, database): ''' diff --git a/tests/test_migrations.py b/tests/test_migrations.py index c98cdba..fa5aef2 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -36,7 +36,7 @@ class MigrationsTestCase(unittest.TestCase): return [(row.name, row.type) for row in self.database.select(query)] def get_table_def(self, model_class): - return self.database.raw('SHOW CREATE TABLE $db.`%s`' % self.table_name) + return self.database.raw('SHOW CREATE TABLE $db.`%s`' % model_class.table_name()) def test_migrations(self): # Creation and deletion of table @@ -131,13 +131,13 @@ class MigrationsTestCase(unittest.TestCase): # Creating indexes self.database.migrate('tests.sample_migrations', 18) self.assertTrue(self.table_exists(ModelWithIndex)) - self.assertIn('INDEX `index`', self.get_table_def()) - self.assertIn('INDEX another_index', self.get_table_def()) + self.assertIn('INDEX index ', self.get_table_def(ModelWithIndex)) + self.assertIn('INDEX another_index ', self.get_table_def(ModelWithIndex)) # Modifying indexes self.database.migrate('tests.sample_migrations', 19) - self.assertNotIn('INDEX `index`', self.get_table_def()) - self.assertIn('INDEX index2', self.get_table_def()) - self.assertIn('INDEX another_index', self.get_table_def()) + self.assertNotIn('INDEX index ', self.get_table_def(ModelWithIndex)) + self.assertIn('INDEX index2 ', self.get_table_def(ModelWithIndex)) + self.assertIn('INDEX another_index ', self.get_table_def(ModelWithIndex)) # Several different models with the same table name, to simulate a table that changes over time From ed51ad5be6c7979b3109b293b663649fef4ee138 Mon Sep 17 00:00:00 2001 From: Niyaz Batyrshin Date: Sun, 7 Jun 2020 12:50:45 +0300 Subject: [PATCH 08/27] DateTime64 field 
closes #145 --- docs/class_reference.md | 7 ++++++ docs/field_types.md | 3 ++- docs/toc.md | 1 + src/infi/clickhouse_orm/fields.py | 42 +++++++++++++++++++++++++++++++ src/infi/clickhouse_orm/funcs.py | 5 ++++ src/infi/clickhouse_orm/models.py | 9 ++++++- tests/test_datetime_fields.py | 14 ++++++++--- tests/test_funcs.py | 1 + 8 files changed, 77 insertions(+), 5 deletions(-) diff --git a/docs/class_reference.md b/docs/class_reference.md index 285f9b4..1d1ee16 100644 --- a/docs/class_reference.md +++ b/docs/class_reference.md @@ -635,6 +635,13 @@ Extends Field #### DateTimeField(default=None, alias=None, materialized=None, readonly=None, codec=None) +### DateTime64Field + +Extends DateTimeField + +#### DateTime64Field(default=None, alias=None, materialized=None, readonly=None, codec=None, precision=6, timezone=None) + + ### Decimal128Field Extends DecimalField diff --git a/docs/field_types.md b/docs/field_types.md index 95e77e1..2bfea79 100644 --- a/docs/field_types.md +++ b/docs/field_types.md @@ -11,6 +11,7 @@ The following field types are supported: | FixedStringField | FixedString| str | Encoded as UTF-8 when written to ClickHouse | DateField | Date | datetime.date | Range 1970-01-01 to 2105-12-31 | DateTimeField | DateTime | datetime.datetime | Minimal value is 1970-01-01 00:00:00; Always in UTC +| DateTime64Field | DateTime64 | datetime.datetime | Minimal value is 1970-01-01 00:00:00; Always in UTC | Int8Field | Int8 | int | Range -128 to 127 | Int16Field | Int16 | int | Range -32768 to 32767 | Int32Field | Int32 | int | Range -2147483648 to 2147483647 @@ -185,4 +186,4 @@ class BooleanField(Field): --- -[<< Field Options](field_options.md) | [Table of Contents](toc.md) | [Table Engines >>](table_engines.md) \ No newline at end of file +[<< Field Options](field_options.md) | [Table of Contents](toc.md) | [Table Engines >>](table_engines.md) diff --git a/docs/toc.md b/docs/toc.md index 5805eaa..4da8cb9 100644 --- a/docs/toc.md +++ b/docs/toc.md @@ -82,6 +82,7 @@ * [BaseIntField](class_reference.md#baseintfield) * [DateField](class_reference.md#datefield) * [DateTimeField](class_reference.md#datetimefield) + * [DateTime64Field](class_reference.md#datetime64field) * [Decimal128Field](class_reference.md#decimal128field) * [Decimal32Field](class_reference.md#decimal32field) * [Decimal64Field](class_reference.md#decimal64field) diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index 0ee90e9..6b7fee4 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -1,11 +1,14 @@ from __future__ import unicode_literals import datetime +from typing import List + import iso8601 import pytz from calendar import timegm from decimal import Decimal, localcontext from uuid import UUID from logging import getLogger +from pytz import UnknownTimeZoneError from .utils import escape, parse_array, comma_join, string_or_func, get_subclass_names from .funcs import F, FunctionOperatorsMixin from ipaddress import IPv4Address, IPv6Address @@ -86,10 +89,17 @@ class Field(FunctionOperatorsMixin): - `db`: Database, used for checking supported features. 
''' sql = self.db_type + args = self.get_db_type_args() + if args: + sql += '(%s)' % ', '.join(args) if with_default_expression: sql += self._extra_params(db) return sql + def get_db_type_args(self) -> List[str]: + """Returns field type arguments""" + return [] + def _extra_params(self, db): sql = '' if self.alias: @@ -219,6 +229,38 @@ class DateTimeField(Field): return escape('%010d' % timegm(value.utctimetuple()), quote) +class DateTime64Field(DateTimeField): + db_type = 'DateTime64' + + def __init__(self, default=None, alias=None, materialized=None, readonly=None, codec=None, + precision: int = 6, timezone: str = None): + super().__init__(default, alias, materialized, readonly, codec) + assert precision is None or isinstance(precision, int), 'Precision must be int type' + assert timezone is None or isinstance(timezone, str), 'Timezone must be string type' + if timezone: + try: + pytz.timezone(timezone) + except UnknownTimeZoneError: + raise Exception('Timezone must be a valid IANA timezone identifier') + self.precision = precision + self.timezone = timezone + + def get_db_type_args(self) -> List[str]: + args = [str(self.precision)] + if self.timezone: + args.append(escape(self.timezone)) + return args + + def to_db_string(self, value: datetime.datetime, quote=True): + """ + Returns the field's value prepared for writing to the database + + Returns string in 0000000000.000000 format, where remainder digits count is equal to precision + """ + width = 11 + self.precision + return escape(f'{value.timestamp():0{width}.{self.precision}f}', quote) + + class BaseIntField(Field): ''' Abstract base class for all integer-type fields. diff --git a/src/infi/clickhouse_orm/funcs.py b/src/infi/clickhouse_orm/funcs.py index 8d59528..409022b 100644 --- a/src/infi/clickhouse_orm/funcs.py +++ b/src/infi/clickhouse_orm/funcs.py @@ -767,6 +767,11 @@ class F(Cond, FunctionOperatorsMixin, metaclass=FMeta): def toDateTime(x): return F('toDateTime', x) + @staticmethod + @type_conversion + def toDateTime64(x, precision, timezone=NO_VALUE): + return F('toDateTime64', x, precision, timezone) + @staticmethod def toString(x): return F('toString', x) diff --git a/src/infi/clickhouse_orm/models.py b/src/infi/clickhouse_orm/models.py index e4766e5..c335d53 100644 --- a/src/infi/clickhouse_orm/models.py +++ b/src/infi/clickhouse_orm/models.py @@ -6,7 +6,7 @@ from logging import getLogger import pytz from .fields import Field, StringField -from .utils import parse_tsv, NO_VALUE, get_subclass_names +from .utils import parse_tsv, NO_VALUE, get_subclass_names, unescape from .query import QuerySet from .funcs import F from .engines import Merge, Distributed @@ -89,6 +89,13 @@ class ModelBase(type): if db_type.startswith('DateTime('): # Some functions return DateTimeField with timezone in brackets return orm_fields.DateTimeField() + # DateTime with timezone + if db_type.startswith('DateTime64('): + precision, *timezone = [s.strip() for s in db_type[11:-1].split(',')] + return orm_fields.DateTime64Field( + precision=int(precision), + timezone=timezone[0][1:-1] if timezone else None + ) # Arrays if db_type.startswith('Array'): inner_field = cls.create_ad_hoc_field(db_type[6 : -1]) diff --git a/tests/test_datetime_fields.py b/tests/test_datetime_fields.py index 3387ee9..6031396 100644 --- a/tests/test_datetime_fields.py +++ b/tests/test_datetime_fields.py @@ -20,8 +20,14 @@ class DateFieldsTest(unittest.TestCase): def test_ad_hoc_model(self): self.database.insert([ - ModelWithDate(date_field='2016-08-30', 
datetime_field='2016-08-30 03:50:00'), - ModelWithDate(date_field='2016-08-31', datetime_field='2016-08-31 01:30:00') + ModelWithDate( + date_field='2016-08-30', + datetime_field='2016-08-30 03:50:00', + datetime64_field='2016-08-30 03:50:00.001'), + ModelWithDate( + date_field='2016-08-31', + datetime_field='2016-08-31 01:30:00', + datetime64_field='2016-08-31 01:30:00.002') ]) # toStartOfHour returns DateTime('Asia/Yekaterinburg') in my case, so I test it here to @@ -30,15 +36,17 @@ class DateFieldsTest(unittest.TestCase): self.assertEqual(len(results), 2) self.assertEqual(results[0].date_field, datetime.date(2016, 8, 30)) self.assertEqual(results[0].datetime_field, datetime.datetime(2016, 8, 30, 3, 50, 0, tzinfo=pytz.UTC)) + self.assertEqual(results[0].datetime64_field, datetime.datetime(2016, 8, 30, 3, 50, 0, 1000, tzinfo=pytz.UTC)) self.assertEqual(results[0].hour_start, datetime.datetime(2016, 8, 30, 3, 0, 0, tzinfo=pytz.UTC)) self.assertEqual(results[1].date_field, datetime.date(2016, 8, 31)) self.assertEqual(results[1].datetime_field, datetime.datetime(2016, 8, 31, 1, 30, 0, tzinfo=pytz.UTC)) + self.assertEqual(results[1].datetime64_field, datetime.datetime(2016, 8, 31, 1, 30, 0, 2000, tzinfo=pytz.UTC)) self.assertEqual(results[1].hour_start, datetime.datetime(2016, 8, 31, 1, 0, 0, tzinfo=pytz.UTC)) class ModelWithDate(Model): - date_field = DateField() datetime_field = DateTimeField() + datetime64_field = DateTime64Field() engine = MergeTree('date_field', ('date_field',)) diff --git a/tests/test_funcs.py b/tests/test_funcs.py index fa352d8..ee627c2 100644 --- a/tests/test_funcs.py +++ b/tests/test_funcs.py @@ -351,6 +351,7 @@ class FuncsTestCase(TestCaseWithData): if self.database.server_timezone != pytz.utc: raise unittest.SkipTest('This test must run with UTC as the server timezone') self._test_func(F.toDateTime('2018-12-31 11:22:33'), datetime(2018, 12, 31, 11, 22, 33, tzinfo=pytz.utc)) + self._test_func(F.toDateTime64('2018-12-31 11:22:33.001', 6), datetime(2018, 12, 31, 11, 22, 33, 1000, tzinfo=pytz.utc)) self._test_func(F.parseDateTimeBestEffort('31/12/2019 10:05AM'), datetime(2019, 12, 31, 10, 5, tzinfo=pytz.utc)) self._test_func(F.parseDateTimeBestEffortOrNull('31/12/2019 10:05AM'), datetime(2019, 12, 31, 10, 5, tzinfo=pytz.utc)) self._test_func(F.parseDateTimeBestEffortOrZero('31/12/2019 10:05AM'), datetime(2019, 12, 31, 10, 5, tzinfo=pytz.utc)) From 62ad18d8ffd4bf4ec85fa91b83719311e2666bb5 Mon Sep 17 00:00:00 2001 From: Niyaz Batyrshin Date: Sun, 7 Jun 2020 13:14:18 +0300 Subject: [PATCH 09/27] removes error reraising --- src/infi/clickhouse_orm/fields.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index 6b7fee4..88eb704 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -238,10 +238,7 @@ class DateTime64Field(DateTimeField): assert precision is None or isinstance(precision, int), 'Precision must be int type' assert timezone is None or isinstance(timezone, str), 'Timezone must be string type' if timezone: - try: - pytz.timezone(timezone) - except UnknownTimeZoneError: - raise Exception('Timezone must be a valid IANA timezone identifier') + pytz.timezone(timezone) self.precision = precision self.timezone = timezone From f30cb87e6028e3b293a6b73e0449a875b4500207 Mon Sep 17 00:00:00 2001 From: Niyaz Batyrshin Date: Fri, 12 Jun 2020 11:29:47 +0300 Subject: [PATCH 10/27] * timezone support moved to base DateTimeField class, timezone accepts 
string value or pytz.timezone * test timezones and DateTime64 precision --- src/infi/clickhouse_orm/fields.py | 59 +++++++++++++++++++++------- src/infi/clickhouse_orm/models.py | 11 ++++-- tests/test_datetime_fields.py | 64 +++++++++++++++++++++++++++++-- tests/test_nullable_fields.py | 2 +- tests/test_simple_fields.py | 41 ++++++++++++++++---- 5 files changed, 146 insertions(+), 31 deletions(-) diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index 88eb704..91ab8ac 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -1,14 +1,13 @@ from __future__ import unicode_literals import datetime -from typing import List - +from typing import List, Union import iso8601 import pytz from calendar import timegm from decimal import Decimal, localcontext from uuid import UUID from logging import getLogger -from pytz import UnknownTimeZoneError +from pytz import BaseTzInfo from .utils import escape, parse_array, comma_join, string_or_func, get_subclass_names from .funcs import F, FunctionOperatorsMixin from ipaddress import IPv4Address, IPv6Address @@ -197,9 +196,23 @@ class DateTimeField(Field): class_default = datetime.datetime.fromtimestamp(0, pytz.utc) db_type = 'DateTime' + def __init__(self, default=None, alias=None, materialized=None, readonly=None, codec=None, + timezone: Union[BaseTzInfo, str] = None): + super().__init__(default, alias, materialized, readonly, codec) + # assert not timezone, 'Temporarily field timezone is not supported' + if timezone: + timezone = timezone if isinstance(timezone, BaseTzInfo) else pytz.timezone(timezone) + self.timezone: BaseTzInfo = timezone + + def get_db_type_args(self) -> List[str]: + args = [] + if self.timezone: + args.append(escape(self.timezone.zone)) + return args + def to_python(self, value, timezone_in_use): if isinstance(value, datetime.datetime): - return value.astimezone(pytz.utc) if value.tzinfo else value.replace(tzinfo=pytz.utc) + return value if value.tzinfo else value.replace(tzinfo=pytz.utc) if isinstance(value, datetime.date): return datetime.datetime(value.year, value.month, value.day, tzinfo=pytz.utc) if isinstance(value, int): @@ -222,7 +235,7 @@ class DateTimeField(Field): # convert naive to aware if dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None: dt = timezone_in_use.localize(dt) - return dt.astimezone(pytz.utc) + return dt raise ValueError('Invalid value for %s - %r' % (self.__class__.__name__, value)) def to_db_string(self, value, quote=True): @@ -233,19 +246,15 @@ class DateTime64Field(DateTimeField): db_type = 'DateTime64' def __init__(self, default=None, alias=None, materialized=None, readonly=None, codec=None, - precision: int = 6, timezone: str = None): - super().__init__(default, alias, materialized, readonly, codec) + timezone: Union[BaseTzInfo, str] = None, precision: int = 6): + super().__init__(default, alias, materialized, readonly, codec, timezone) assert precision is None or isinstance(precision, int), 'Precision must be int type' - assert timezone is None or isinstance(timezone, str), 'Timezone must be string type' - if timezone: - pytz.timezone(timezone) self.precision = precision - self.timezone = timezone def get_db_type_args(self) -> List[str]: args = [str(self.precision)] if self.timezone: - args.append(escape(self.timezone)) + args.append(escape(self.timezone.zone)) return args def to_db_string(self, value: datetime.datetime, quote=True): @@ -254,8 +263,30 @@ class DateTime64Field(DateTimeField): Returns string in 0000000000.000000 format, 
where remainder digits count is equal to precision """ - width = 11 + self.precision - return escape(f'{value.timestamp():0{width}.{self.precision}f}', quote) + return escape( + '{timestamp:0{width}.{precision}f}'.format( + timestamp=value.timestamp(), + width=11 + self.precision, + precision=6), + quote + ) + + def to_python(self, value, timezone_in_use): + try: + return super().to_python(value, timezone_in_use) + except ValueError: + if isinstance(value, (int, float)): + return datetime.datetime.utcfromtimestamp(value).replace(tzinfo=pytz.utc) + if isinstance(value, str): + if value.split('.')[0] == '0000-00-00 00:00:00': + return self.class_default + if len(value.split('.')[0]) == 10: + try: + value = float(value) + return datetime.datetime.utcfromtimestamp(value).replace(tzinfo=pytz.utc) + except ValueError: + pass + raise class BaseIntField(Field): diff --git a/src/infi/clickhouse_orm/models.py b/src/infi/clickhouse_orm/models.py index ee99c20..e3f95e3 100644 --- a/src/infi/clickhouse_orm/models.py +++ b/src/infi/clickhouse_orm/models.py @@ -206,8 +206,10 @@ class ModelBase(type): return orm_fields.BaseEnumField.create_ad_hoc_field(db_type) # DateTime with timezone if db_type.startswith('DateTime('): - # Some functions return DateTimeField with timezone in brackets - return orm_fields.DateTimeField() + timezone = db_type[9:-1] + return orm_fields.DateTimeField( + timezone=timezone[1:-1] if timezone else None + ) # DateTime64 if db_type.startswith('DateTime64('): precision, *timezone = [s.strip() for s in db_type[11:-1].split(',')] @@ -382,14 +384,15 @@ class Model(metaclass=ModelBase): - `line`: the TSV-formatted data. - `field_names`: names of the model fields in the data. - - `timezone_in_use`: the timezone to use when parsing dates and datetimes. + - `timezone_in_use`: the timezone to use when parsing dates and datetimes. Some fields use their own timezones. - `database`: if given, sets the database that this instance belongs to. 
''' values = iter(parse_tsv(line)) kwargs = {} for name in field_names: field = getattr(cls, name) - kwargs[name] = field.to_python(next(values), timezone_in_use) + field_timezone = getattr(field, 'timezone', None) or timezone_in_use + kwargs[name] = field.to_python(next(values), field_timezone) obj = cls(**kwargs) if database is not None: diff --git a/tests/test_datetime_fields.py b/tests/test_datetime_fields.py index 6031396..c5ab846 100644 --- a/tests/test_datetime_fields.py +++ b/tests/test_datetime_fields.py @@ -23,11 +23,14 @@ class DateFieldsTest(unittest.TestCase): ModelWithDate( date_field='2016-08-30', datetime_field='2016-08-30 03:50:00', - datetime64_field='2016-08-30 03:50:00.001'), + datetime64_field='2016-08-30 03:50:00.123456', + datetime64_3_field='2016-08-30 03:50:00.123456' + ), ModelWithDate( date_field='2016-08-31', datetime_field='2016-08-31 01:30:00', - datetime64_field='2016-08-31 01:30:00.002') + datetime64_field='2016-08-31 01:30:00.123456', + datetime64_3_field='2016-08-31 01:30:00.123456') ]) # toStartOfHour returns DateTime('Asia/Yekaterinburg') in my case, so I test it here to @@ -36,17 +39,70 @@ class DateFieldsTest(unittest.TestCase): self.assertEqual(len(results), 2) self.assertEqual(results[0].date_field, datetime.date(2016, 8, 30)) self.assertEqual(results[0].datetime_field, datetime.datetime(2016, 8, 30, 3, 50, 0, tzinfo=pytz.UTC)) - self.assertEqual(results[0].datetime64_field, datetime.datetime(2016, 8, 30, 3, 50, 0, 1000, tzinfo=pytz.UTC)) self.assertEqual(results[0].hour_start, datetime.datetime(2016, 8, 30, 3, 0, 0, tzinfo=pytz.UTC)) self.assertEqual(results[1].date_field, datetime.date(2016, 8, 31)) self.assertEqual(results[1].datetime_field, datetime.datetime(2016, 8, 31, 1, 30, 0, tzinfo=pytz.UTC)) - self.assertEqual(results[1].datetime64_field, datetime.datetime(2016, 8, 31, 1, 30, 0, 2000, tzinfo=pytz.UTC)) self.assertEqual(results[1].hour_start, datetime.datetime(2016, 8, 31, 1, 0, 0, tzinfo=pytz.UTC)) + self.assertEqual(results[0].datetime64_field, datetime.datetime(2016, 8, 30, 3, 50, 0, 123456, tzinfo=pytz.UTC)) + self.assertEqual(results[0].datetime64_3_field, datetime.datetime(2016, 8, 30, 3, 50, 0, 123000, + tzinfo=pytz.UTC)) + self.assertEqual(results[1].datetime64_field, datetime.datetime(2016, 8, 31, 1, 30, 0, 123456, tzinfo=pytz.UTC)) + self.assertEqual(results[1].datetime64_3_field, datetime.datetime(2016, 8, 31, 1, 30, 0, 123000, + tzinfo=pytz.UTC)) + class ModelWithDate(Model): date_field = DateField() datetime_field = DateTimeField() datetime64_field = DateTime64Field() + datetime64_3_field = DateTime64Field(precision=3) engine = MergeTree('date_field', ('date_field',)) + + +class ModelWithTz(Model): + datetime_no_tz_field = DateTimeField() # server tz + datetime_tz_field = DateTimeField(timezone='Europe/Madrid') + datetime_utc_field = DateTimeField(timezone=pytz.UTC) + + engine = MergeTree('datetime_no_tz_field', ('datetime_no_tz_field',)) + + +class DateTimeFieldWithTzTest(unittest.TestCase): + def setUp(self): + self.database = Database('test-db', log_statements=True) + self.database.create_table(ModelWithTz) + + def tearDown(self): + self.database.drop_database() + + def test_ad_hoc_model(self): + self.database.insert([ + ModelWithTz( + datetime_no_tz_field='2020-06-11 04:00:00', + datetime_tz_field='2020-06-11 04:00:00', + datetime_utc_field='2020-06-11 04:00:00', + ), + ModelWithTz( + datetime_no_tz_field='2020-06-11 07:00:00+0300', + datetime_tz_field='2020-06-11 07:00:00+0300', + datetime_utc_field='2020-06-11 
07:00:00+0300', + ), + ]) + query = 'SELECT * from $db.modelwithtz ORDER BY datetime_no_tz_field' + results = list(self.database.select(query)) + + self.assertEqual(results[0].datetime_no_tz_field, datetime.datetime(2020, 6, 11, 4, 0, 0, tzinfo=pytz.UTC)) + self.assertEqual(results[0].datetime_tz_field, datetime.datetime(2020, 6, 11, 4, 0, 0, tzinfo=pytz.UTC)) + self.assertEqual(results[0].datetime_utc_field, datetime.datetime(2020, 6, 11, 4, 0, 0, tzinfo=pytz.UTC)) + self.assertEqual(results[1].datetime_no_tz_field, datetime.datetime(2020, 6, 11, 4, 0, 0, tzinfo=pytz.UTC)) + self.assertEqual(results[1].datetime_tz_field, datetime.datetime(2020, 6, 11, 4, 0, 0, tzinfo=pytz.UTC)) + self.assertEqual(results[1].datetime_utc_field, datetime.datetime(2020, 6, 11, 4, 0, 0, tzinfo=pytz.UTC)) + + self.assertEqual(results[0].datetime_no_tz_field.tzinfo.zone, self.database.server_timezone.zone) + self.assertEqual(results[0].datetime_tz_field.tzinfo.zone, pytz.timezone('Europe/Madrid').zone) + self.assertEqual(results[0].datetime_utc_field.tzinfo.zone, pytz.timezone('UTC').zone) + self.assertEqual(results[1].datetime_no_tz_field.tzinfo.zone, self.database.server_timezone.zone) + self.assertEqual(results[1].datetime_tz_field.tzinfo.zone, pytz.timezone('Europe/Madrid').zone) + self.assertEqual(results[1].datetime_utc_field.tzinfo.zone, pytz.timezone('UTC').zone) diff --git a/tests/test_nullable_fields.py b/tests/test_nullable_fields.py index b395a9b..e65c2f8 100644 --- a/tests/test_nullable_fields.py +++ b/tests/test_nullable_fields.py @@ -38,7 +38,7 @@ class NullableFieldsTest(unittest.TestCase): if value == '\\N': self.assertIsNone(dt) else: - self.assertEqual(dt.tzinfo, pytz.utc) + self.assertTrue(dt.tzinfo) # Verify that conversion to and from db string does not change value dt2 = f.to_python(f.to_db_string(dt, quote=False), pytz.utc) self.assertEqual(dt, dt2) diff --git a/tests/test_simple_fields.py b/tests/test_simple_fields.py index a6cec47..54ddac7 100644 --- a/tests/test_simple_fields.py +++ b/tests/test_simple_fields.py @@ -6,18 +6,21 @@ import pytz class SimpleFieldsTest(unittest.TestCase): + epoch = datetime(1970, 1, 1, tzinfo=pytz.utc) + # Valid values + dates = [ + date(1970, 1, 1), datetime(1970, 1, 1), epoch, + epoch.astimezone(pytz.timezone('US/Eastern')), epoch.astimezone(pytz.timezone('Asia/Jerusalem')), + '1970-01-01 00:00:00', '1970-01-17 00:00:17', '0000-00-00 00:00:00', 0, + '2017-07-26T08:31:05', '2017-07-26T08:31:05Z', '2017-07-26 08:31', + '2017-07-26T13:31:05+05', '2017-07-26 13:31:05+0500' + ] def test_datetime_field(self): f = DateTimeField() - epoch = datetime(1970, 1, 1, tzinfo=pytz.utc) - # Valid values - for value in (date(1970, 1, 1), datetime(1970, 1, 1), epoch, - epoch.astimezone(pytz.timezone('US/Eastern')), epoch.astimezone(pytz.timezone('Asia/Jerusalem')), - '1970-01-01 00:00:00', '1970-01-17 00:00:17', '0000-00-00 00:00:00', 0, - '2017-07-26T08:31:05', '2017-07-26T08:31:05Z', '2017-07-26 08:31', - '2017-07-26T13:31:05+05', '2017-07-26 13:31:05+0500'): + for value in self.dates: dt = f.to_python(value, pytz.utc) - self.assertEqual(dt.tzinfo, pytz.utc) + self.assertTrue(dt.tzinfo) # Verify that conversion to and from db string does not change value dt2 = f.to_python(f.to_db_string(dt, quote=False), pytz.utc) self.assertEqual(dt, dt2) @@ -27,6 +30,28 @@ class SimpleFieldsTest(unittest.TestCase): with self.assertRaises(ValueError): f.to_python(value, pytz.utc) + def test_datetime64_field(self): + f = DateTime64Field() + epoch = datetime(1970, 1, 1, tzinfo=pytz.utc) + 
# Valid values + for value in self.dates + [ + datetime(1970, 1, 1, microsecond=100000), + datetime(1970, 1, 1, microsecond=100000).astimezone(pytz.timezone('US/Eastern')), + '1970-01-01 00:00:00.1', '1970-01-17 00:00:17.1', '0000-00-00 00:00:00.1', 0.1, + '2017-07-26T08:31:05.1', '2017-07-26T08:31:05.1Z', '2017-07-26 08:31.1', + '2017-07-26T13:31:05.1+05', '2017-07-26 13:31:05.1+0500' + ]: + dt = f.to_python(value, pytz.utc) + self.assertTrue(dt.tzinfo) + # Verify that conversion to and from db string does not change value + dt2 = f.to_python(f.to_db_string(dt, quote=False), pytz.utc) + self.assertEqual(dt, dt2) + # Invalid values + for value in ('nope', '21/7/1999', + '2017-01 15:06:00', '2017-01-01X15:06:00', '2017-13-01T15:06:00'): + with self.assertRaises(ValueError): + f.to_python(value, pytz.utc) + def test_date_field(self): f = DateField() epoch = date(1970, 1, 1) From 2a38c5200c9c8a00e464ae4ba5b75175909d12a9 Mon Sep 17 00:00:00 2001 From: Niyaz Batyrshin Date: Fri, 12 Jun 2020 11:36:16 +0300 Subject: [PATCH 11/27] test DateTime64 timezones --- tests/test_datetime_fields.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_datetime_fields.py b/tests/test_datetime_fields.py index c5ab846..5554094 100644 --- a/tests/test_datetime_fields.py +++ b/tests/test_datetime_fields.py @@ -64,6 +64,7 @@ class ModelWithDate(Model): class ModelWithTz(Model): datetime_no_tz_field = DateTimeField() # server tz datetime_tz_field = DateTimeField(timezone='Europe/Madrid') + datetime64_tz_field = DateTime64Field(timezone='Europe/Madrid') datetime_utc_field = DateTimeField(timezone=pytz.UTC) engine = MergeTree('datetime_no_tz_field', ('datetime_no_tz_field',)) @@ -82,11 +83,13 @@ class DateTimeFieldWithTzTest(unittest.TestCase): ModelWithTz( datetime_no_tz_field='2020-06-11 04:00:00', datetime_tz_field='2020-06-11 04:00:00', + datetime64_tz_field='2020-06-11 04:00:00', datetime_utc_field='2020-06-11 04:00:00', ), ModelWithTz( datetime_no_tz_field='2020-06-11 07:00:00+0300', datetime_tz_field='2020-06-11 07:00:00+0300', + datetime64_tz_field='2020-06-11 07:00:00+0300', datetime_utc_field='2020-06-11 07:00:00+0300', ), ]) @@ -95,14 +98,18 @@ class DateTimeFieldWithTzTest(unittest.TestCase): self.assertEqual(results[0].datetime_no_tz_field, datetime.datetime(2020, 6, 11, 4, 0, 0, tzinfo=pytz.UTC)) self.assertEqual(results[0].datetime_tz_field, datetime.datetime(2020, 6, 11, 4, 0, 0, tzinfo=pytz.UTC)) + self.assertEqual(results[0].datetime64_tz_field, datetime.datetime(2020, 6, 11, 4, 0, 0, tzinfo=pytz.UTC)) self.assertEqual(results[0].datetime_utc_field, datetime.datetime(2020, 6, 11, 4, 0, 0, tzinfo=pytz.UTC)) self.assertEqual(results[1].datetime_no_tz_field, datetime.datetime(2020, 6, 11, 4, 0, 0, tzinfo=pytz.UTC)) self.assertEqual(results[1].datetime_tz_field, datetime.datetime(2020, 6, 11, 4, 0, 0, tzinfo=pytz.UTC)) + self.assertEqual(results[1].datetime64_tz_field, datetime.datetime(2020, 6, 11, 4, 0, 0, tzinfo=pytz.UTC)) self.assertEqual(results[1].datetime_utc_field, datetime.datetime(2020, 6, 11, 4, 0, 0, tzinfo=pytz.UTC)) self.assertEqual(results[0].datetime_no_tz_field.tzinfo.zone, self.database.server_timezone.zone) self.assertEqual(results[0].datetime_tz_field.tzinfo.zone, pytz.timezone('Europe/Madrid').zone) + self.assertEqual(results[0].datetime64_tz_field.tzinfo.zone, pytz.timezone('Europe/Madrid').zone) self.assertEqual(results[0].datetime_utc_field.tzinfo.zone, pytz.timezone('UTC').zone) self.assertEqual(results[1].datetime_no_tz_field.tzinfo.zone, 
self.database.server_timezone.zone) self.assertEqual(results[1].datetime_tz_field.tzinfo.zone, pytz.timezone('Europe/Madrid').zone) + self.assertEqual(results[1].datetime64_tz_field.tzinfo.zone, pytz.timezone('Europe/Madrid').zone) self.assertEqual(results[1].datetime_utc_field.tzinfo.zone, pytz.timezone('UTC').zone) From 888f8dc4da4268adf5d2922bf7fdbb794235171d Mon Sep 17 00:00:00 2001 From: Niyaz Batyrshin Date: Fri, 12 Jun 2020 11:50:38 +0300 Subject: [PATCH 12/27] fix docs --- docs/class_reference.md | 2 +- docs/field_types.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/class_reference.md b/docs/class_reference.md index abc4b06..3874387 100644 --- a/docs/class_reference.md +++ b/docs/class_reference.md @@ -862,7 +862,7 @@ Extends Field Extends Field -#### DateTimeField(default=None, alias=None, materialized=None, readonly=None, codec=None) +#### DateTimeField(default=None, alias=None, materialized=None, readonly=None, codec=None, timezone=None) ### DateTime64Field diff --git a/docs/field_types.md b/docs/field_types.md index 2bfea79..d894802 100644 --- a/docs/field_types.md +++ b/docs/field_types.md @@ -10,8 +10,8 @@ The following field types are supported: | StringField | String | str | Encoded as UTF-8 when written to ClickHouse | FixedStringField | FixedString| str | Encoded as UTF-8 when written to ClickHouse | DateField | Date | datetime.date | Range 1970-01-01 to 2105-12-31 -| DateTimeField | DateTime | datetime.datetime | Minimal value is 1970-01-01 00:00:00; Always in UTC -| DateTime64Field | DateTime64 | datetime.datetime | Minimal value is 1970-01-01 00:00:00; Always in UTC +| DateTimeField | DateTime | datetime.datetime | Minimal value is 1970-01-01 00:00:00; Timezone aware +| DateTime64Field | DateTime64 | datetime.datetime | Minimal value is 1970-01-01 00:00:00; Timezone aware | Int8Field | Int8 | int | Range -128 to 127 | Int16Field | Int16 | int | Range -32768 to 32767 | Int32Field | Int32 | int | Range -2147483648 to 2147483647 From 45552fdb977c4cb6e87ac60ea362926086fc6f8a Mon Sep 17 00:00:00 2001 From: Christian Pedersen Date: Tue, 23 Jun 2020 09:35:13 +0200 Subject: [PATCH 13/27] Support FINAL for ReplacingMergeTree --- src/infi/clickhouse_orm/query.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/infi/clickhouse_orm/query.py b/src/infi/clickhouse_orm/query.py index cf30ce9..376b9ac 100644 --- a/src/infi/clickhouse_orm/query.py +++ b/src/infi/clickhouse_orm/query.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import pytz from copy import copy, deepcopy from math import ceil -from .engines import CollapsingMergeTree +from .engines import CollapsingMergeTree, ReplacingMergeTree from datetime import date, datetime from .utils import comma_join, string_or_func @@ -540,8 +540,8 @@ class QuerySet(object): Adds a FINAL modifier to table, meaning data will be collapsed to final version. Can be used with `CollapsingMergeTree` engine only. 
""" - if not isinstance(self._model_cls.engine, CollapsingMergeTree): - raise TypeError('final() method can be used only with CollapsingMergeTree engine') + if not isinstance(self._model_cls.engine, (CollapsingMergeTree, ReplacingMergeTree)): + raise TypeError('final() method can be used only with the CollapsingMergeTree and ReplacingMergeTree engines') qs = copy(self) qs._final = True From 7fe76c185ddca474e084cfb9946d8f8170f9e5d7 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Tue, 23 Jun 2020 11:04:42 +0300 Subject: [PATCH 14/27] DateTime64 field - additional fixes & docs --- CHANGELOG.md | 6 ++++++ docs/field_types.md | 12 +++++++++--- src/infi/clickhouse_orm/fields.py | 7 ++++--- tests/test_datetime_fields.py | 3 +++ tests/test_funcs.py | 8 +++++--- tests/test_simple_fields.py | 10 +++++++++- 6 files changed, 36 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 054bf18..8c40e2a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ Unreleased ---------- - Support for model constraints - Support for data skipping indexes +- Added `DateTime64Field` (NiyazNz) +- Make `DateTimeField` and `DateTime64Field` timezone-aware (NiyazNz) + +**Backwards incompatibile changes** + +Previously, `DateTimeField` always converted its value from the database timezone to UTC. This is no longer the case: the field's value now preserves the timezone it was defined with, or if not specified - the database's global timezone. This change has no effect if your database timezone is set UTC. v2.0.1 ------ diff --git a/docs/field_types.md b/docs/field_types.md index d894802..7613276 100644 --- a/docs/field_types.md +++ b/docs/field_types.md @@ -38,16 +38,22 @@ The following field types are supported: DateTimeField and Time Zones ---------------------------- -A `DateTimeField` can be assigned values from one of the following types: +`DateTimeField` and `DateTime64Field` can accept a `timezone` parameter (either the timezone name or a `pytz` timezone instance). This timezone will be used as the column timezone in ClickHouse. If not provided, the fields will use the timezone defined in the database configuration. + +A `DateTimeField` and `DateTime64Field` can be assigned values from one of the following types: - datetime - date - integer - number of seconds since the Unix epoch +- float (DateTime64Field only) - number of seconds and microseconds since the Unix epoch - string in `YYYY-MM-DD HH:MM:SS` format or [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601)-compatible format -The assigned value always gets converted to a timezone-aware `datetime` in UTC. If the assigned value is a timezone-aware `datetime` in another timezone, it will be converted to UTC. Otherwise, the assigned value is assumed to already be in UTC. +The assigned value always gets converted to a timezone-aware `datetime` in UTC. The only exception is when the assigned value is a timezone-aware `datetime`, in which case it will not be changed. + +DateTime values that are read from the database are kept in the database-defined timezone - either the one defined for the field, or the global timezone defined in the database configuration. + +It is strongly recommended to set the server timezone to UTC and to store all datetime values in that timezone, in order to prevent confusion and subtle bugs. Conversion to a different timezone should only be performed when the value needs to be displayed. -DateTime values that are read from the database are also converted to UTC. 
ClickHouse formats them according to the timezone of the server, and the ORM makes the necessary conversions. This requires a ClickHouse version which is new enough to support the `timezone()` function, otherwise it is assumed to be using UTC. In any case, we recommend settings the server timezone to UTC in order to prevent confusion. Working with enum fields ------------------------ diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index 91ab8ac..08ca6e6 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -267,7 +267,7 @@ class DateTime64Field(DateTimeField): '{timestamp:0{width}.{precision}f}'.format( timestamp=value.timestamp(), width=11 + self.precision, - precision=6), + precision=self.precision), quote ) @@ -278,9 +278,10 @@ class DateTime64Field(DateTimeField): if isinstance(value, (int, float)): return datetime.datetime.utcfromtimestamp(value).replace(tzinfo=pytz.utc) if isinstance(value, str): - if value.split('.')[0] == '0000-00-00 00:00:00': + left_part = value.split('.')[0] + if left_part == '0000-00-00 00:00:00': return self.class_default - if len(value.split('.')[0]) == 10: + if len(left_part) == 10: try: value = float(value) return datetime.datetime.utcfromtimestamp(value).replace(tzinfo=pytz.utc) diff --git a/tests/test_datetime_fields.py b/tests/test_datetime_fields.py index 5554094..122dd9f 100644 --- a/tests/test_datetime_fields.py +++ b/tests/test_datetime_fields.py @@ -71,8 +71,11 @@ class ModelWithTz(Model): class DateTimeFieldWithTzTest(unittest.TestCase): + def setUp(self): self.database = Database('test-db', log_statements=True) + if self.database.server_version < (20, 1, 2, 4): + raise unittest.SkipTest('ClickHouse version too old') self.database.create_table(ModelWithTz) def tearDown(self): diff --git a/tests/test_funcs.py b/tests/test_funcs.py index ee627c2..a262e83 100644 --- a/tests/test_funcs.py +++ b/tests/test_funcs.py @@ -34,6 +34,7 @@ class FuncsTestCase(TestCaseWithData): result = list(self.database.select(sql)) logging.info('\t==> %s', result[0].value if result else '') if expected_value != NO_VALUE: + print('Comparing %s to %s' % (result[0].value, expected_value)) self.assertEqual(result[0].value, expected_value) return result[0].value if result else None except ServerError as e: @@ -310,12 +311,13 @@ class FuncsTestCase(TestCaseWithData): raise unittest.SkipTest('This test must run with UTC as the server timezone') d = date(2018, 12, 31) dt = datetime(2018, 12, 31, 11, 22, 33) + athens_tz = pytz.timezone('Europe/Athens') self._test_func(F.toHour(dt), 11) self._test_func(F.toStartOfDay(dt), datetime(2018, 12, 31, 0, 0, 0, tzinfo=pytz.utc)) self._test_func(F.toTime(dt, pytz.utc), datetime(1970, 1, 2, 11, 22, 33, tzinfo=pytz.utc)) - self._test_func(F.toTime(dt, 'Europe/Athens'), datetime(1970, 1, 2, 13, 22, 33, tzinfo=pytz.utc)) - self._test_func(F.toTime(dt, pytz.timezone('Europe/Athens')), datetime(1970, 1, 2, 13, 22, 33, tzinfo=pytz.utc)) - self._test_func(F.toTimeZone(dt, 'Europe/Athens'), datetime(2018, 12, 31, 13, 22, 33, tzinfo=pytz.utc)) + self._test_func(F.toTime(dt, 'Europe/Athens'), athens_tz.localize(datetime(1970, 1, 2, 13, 22, 33))) + self._test_func(F.toTime(dt, athens_tz), athens_tz.localize(datetime(1970, 1, 2, 13, 22, 33))) + self._test_func(F.toTimeZone(dt, 'Europe/Athens'), athens_tz.localize(datetime(2018, 12, 31, 13, 22, 33))) self._test_func(F.now(), datetime.utcnow().replace(tzinfo=pytz.utc, microsecond=0)) # FIXME this may fail if the timing is just right 
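        # Why the FIXME above: F.now() is evaluated by the ClickHouse server at
        # query time, while the expected value is computed on the client just
        # before the query is sent. If the wall clock crosses a second boundary
        # in between, the two timestamps differ by one second and the test fails.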
self._test_func(F.today(), datetime.utcnow().date()) self._test_func(F.yesterday(), datetime.utcnow().date() - timedelta(days=1)) diff --git a/tests/test_simple_fields.py b/tests/test_simple_fields.py index 54ddac7..1494bda 100644 --- a/tests/test_simple_fields.py +++ b/tests/test_simple_fields.py @@ -6,6 +6,7 @@ import pytz class SimpleFieldsTest(unittest.TestCase): + epoch = datetime(1970, 1, 1, tzinfo=pytz.utc) # Valid values dates = [ @@ -32,7 +33,6 @@ class SimpleFieldsTest(unittest.TestCase): def test_datetime64_field(self): f = DateTime64Field() - epoch = datetime(1970, 1, 1, tzinfo=pytz.utc) # Valid values for value in self.dates + [ datetime(1970, 1, 1, microsecond=100000), @@ -52,6 +52,14 @@ class SimpleFieldsTest(unittest.TestCase): with self.assertRaises(ValueError): f.to_python(value, pytz.utc) + def test_datetime64_field_precision(self): + for precision in range(1, 7): + f = DateTime64Field(precision=precision, timezone=pytz.utc) + dt = f.to_python(datetime(2000, 1, 1, microsecond=123456), pytz.utc) + dt2 = f.to_python(f.to_db_string(dt, quote=False), pytz.utc) + m = round(123456, precision - 6) # round rightmost microsecond digits according to precision + self.assertEqual(dt2, dt.replace(microsecond=m)) + def test_date_field(self): f = DateField() epoch = date(1970, 1, 1) From f98dead1ab36b4bb35d4dc8e4a836600544c85a6 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Tue, 23 Jun 2020 11:26:20 +0300 Subject: [PATCH 15/27] DateTime64 field - additional fixes & docs --- tests/test_simple_fields.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_simple_fields.py b/tests/test_simple_fields.py index 1494bda..0e89ed5 100644 --- a/tests/test_simple_fields.py +++ b/tests/test_simple_fields.py @@ -36,7 +36,7 @@ class SimpleFieldsTest(unittest.TestCase): # Valid values for value in self.dates + [ datetime(1970, 1, 1, microsecond=100000), - datetime(1970, 1, 1, microsecond=100000).astimezone(pytz.timezone('US/Eastern')), + pytz.timezone('US/Eastern').localize(datetime(1970, 1, 1, microsecond=100000)), '1970-01-01 00:00:00.1', '1970-01-17 00:00:17.1', '0000-00-00 00:00:00.1', 0.1, '2017-07-26T08:31:05.1', '2017-07-26T08:31:05.1Z', '2017-07-26 08:31.1', '2017-07-26T13:31:05.1+05', '2017-07-26 13:31:05.1+0500' From c45bd07cea0d5aa6ae5f83bc7c406d89e32c606f Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Tue, 23 Jun 2020 11:26:43 +0300 Subject: [PATCH 16/27] Remove type hints - not supported in Python 3.5 --- src/infi/clickhouse_orm/fields.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index 08ca6e6..07172ae 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals import datetime -from typing import List, Union import iso8601 import pytz from calendar import timegm @@ -95,7 +94,7 @@ class Field(FunctionOperatorsMixin): sql += self._extra_params(db) return sql - def get_db_type_args(self) -> List[str]: + def get_db_type_args(self): """Returns field type arguments""" return [] @@ -197,14 +196,14 @@ class DateTimeField(Field): db_type = 'DateTime' def __init__(self, default=None, alias=None, materialized=None, readonly=None, codec=None, - timezone: Union[BaseTzInfo, str] = None): + timezone=None): super().__init__(default, alias, materialized, readonly, codec) # assert not timezone, 'Temporarily field timezone is not supported' if timezone: timezone = timezone if 
isinstance(timezone, BaseTzInfo) else pytz.timezone(timezone) - self.timezone: BaseTzInfo = timezone + self.timezone = timezone - def get_db_type_args(self) -> List[str]: + def get_db_type_args(self): args = [] if self.timezone: args.append(escape(self.timezone.zone)) @@ -246,18 +245,18 @@ class DateTime64Field(DateTimeField): db_type = 'DateTime64' def __init__(self, default=None, alias=None, materialized=None, readonly=None, codec=None, - timezone: Union[BaseTzInfo, str] = None, precision: int = 6): + timezone=None, precision=6): super().__init__(default, alias, materialized, readonly, codec, timezone) assert precision is None or isinstance(precision, int), 'Precision must be int type' self.precision = precision - def get_db_type_args(self) -> List[str]: + def get_db_type_args(self): args = [str(self.precision)] if self.timezone: args.append(escape(self.timezone.zone)) return args - def to_db_string(self, value: datetime.datetime, quote=True): + def to_db_string(self, value, quote=True): """ Returns the field's value prepared for writing to the database From 8fea844d70bb7236578b11d671f73e4368420cbd Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Tue, 23 Jun 2020 11:32:43 +0300 Subject: [PATCH 17/27] DateTime64 field - additional fixes & docs --- tests/test_datetime_fields.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_datetime_fields.py b/tests/test_datetime_fields.py index 122dd9f..2f2300b 100644 --- a/tests/test_datetime_fields.py +++ b/tests/test_datetime_fields.py @@ -13,6 +13,8 @@ class DateFieldsTest(unittest.TestCase): def setUp(self): self.database = Database('test-db', log_statements=True) + if self.database.server_version < (20, 1, 2, 4): + raise unittest.SkipTest('ClickHouse version too old') self.database.create_table(ModelWithDate) def tearDown(self): From f35333a7b62c77769f45f34ddcf7d8720f4bbbeb Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Tue, 23 Jun 2020 11:37:41 +0300 Subject: [PATCH 18/27] Support FINAL for ReplacingMergeTree --- CHANGELOG.md | 1 + docs/class_reference.md | 35 ++++++++++++++++++++------------ docs/toc.md | 2 +- src/infi/clickhouse_orm/query.py | 4 ++-- tests/test_querysets.py | 2 +- 5 files changed, 27 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c40e2a..979edeb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ Unreleased ---------- - Support for model constraints - Support for data skipping indexes +- Support FINAL for `ReplacingMergeTree` (chripede) - Added `DateTime64Field` (NiyazNz) - Make `DateTimeField` and `DateTime64Field` timezone-aware (NiyazNz) diff --git a/docs/class_reference.md b/docs/class_reference.md index 3874387..d415b13 100644 --- a/docs/class_reference.md +++ b/docs/class_reference.md @@ -203,7 +203,7 @@ The `field_names` list must match the fields defined in the model, but does not - `line`: the TSV-formatted data. - `field_names`: names of the model fields in the data. -- `timezone_in_use`: the timezone to use when parsing dates and datetimes. +- `timezone_in_use`: the timezone to use when parsing dates and datetimes. Some fields use their own timezones. - `database`: if given, sets the database that this instance belongs to. @@ -333,7 +333,7 @@ The `field_names` list must match the fields defined in the model, but does not - `line`: the TSV-formatted data. - `field_names`: names of the model fields in the data. -- `timezone_in_use`: the timezone to use when parsing dates and datetimes. +- `timezone_in_use`: the timezone to use when parsing dates and datetimes. 
Some fields use their own timezones. - `database`: if given, sets the database that this instance belongs to. @@ -468,7 +468,7 @@ The `field_names` list must match the fields defined in the model, but does not - `line`: the TSV-formatted data. - `field_names`: names of the model fields in the data. -- `timezone_in_use`: the timezone to use when parsing dates and datetimes. +- `timezone_in_use`: the timezone to use when parsing dates and datetimes. Some fields use their own timezones. - `database`: if given, sets the database that this instance belongs to. @@ -634,7 +634,7 @@ The `field_names` list must match the fields defined in the model, but does not - `line`: the TSV-formatted data. - `field_names`: names of the model fields in the data. -- `timezone_in_use`: the timezone to use when parsing dates and datetimes. +- `timezone_in_use`: the timezone to use when parsing dates and datetimes. Some fields use their own timezones. - `database`: if given, sets the database that this instance belongs to. @@ -858,6 +858,13 @@ Extends Field #### DateField(default=None, alias=None, materialized=None, readonly=None, codec=None) +### DateTime64Field + +Extends DateTimeField + +#### DateTime64Field(default=None, alias=None, materialized=None, readonly=None, codec=None, timezone=None, precision=6) + + ### DateTimeField Extends Field @@ -865,13 +872,6 @@ Extends Field #### DateTimeField(default=None, alias=None, materialized=None, readonly=None, codec=None, timezone=None) -### DateTime64Field - -Extends DateTimeField - -#### DateTime64Field(default=None, alias=None, materialized=None, readonly=None, codec=None, precision=6, timezone=None) - - ### Decimal128Field Extends DecimalField @@ -1217,7 +1217,7 @@ Pass `prewhere=True` to apply the conditions as PREWHERE instead of WHERE. Adds a FINAL modifier to table, meaning data will be collapsed to final version. -Can be used with `CollapsingMergeTree` engine only. +Can be used with the `CollapsingMergeTree` and `ReplacingMergeTree` engines only. #### limit_by(offset_limit, *fields_or_expr) @@ -1340,7 +1340,7 @@ Pass `prewhere=True` to apply the conditions as PREWHERE instead of WHERE. Adds a FINAL modifier to table, meaning data will be collapsed to final version. -Can be used with `CollapsingMergeTree` engine only. +Can be used with the `CollapsingMergeTree` and `ReplacingMergeTree` engines only. #### group_by(*args) @@ -2744,6 +2744,15 @@ Initializer. 
#### toDateTime(**kwargs) +#### toDateTime64(**kwargs) + + +#### toDateTime64OrNull(precision, timezone=NO_VALUE) + + +#### toDateTime64OrZero(precision, timezone=NO_VALUE) + + #### toDateTimeOrNull() diff --git a/docs/toc.md b/docs/toc.md index 203e59b..a0f7e5e 100644 --- a/docs/toc.md +++ b/docs/toc.md @@ -93,8 +93,8 @@ * [BaseFloatField](class_reference.md#basefloatfield) * [BaseIntField](class_reference.md#baseintfield) * [DateField](class_reference.md#datefield) - * [DateTimeField](class_reference.md#datetimefield) * [DateTime64Field](class_reference.md#datetime64field) + * [DateTimeField](class_reference.md#datetimefield) * [Decimal128Field](class_reference.md#decimal128field) * [Decimal32Field](class_reference.md#decimal32field) * [Decimal64Field](class_reference.md#decimal64field) diff --git a/src/infi/clickhouse_orm/query.py b/src/infi/clickhouse_orm/query.py index 376b9ac..19cd9f4 100644 --- a/src/infi/clickhouse_orm/query.py +++ b/src/infi/clickhouse_orm/query.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import pytz from copy import copy, deepcopy from math import ceil -from .engines import CollapsingMergeTree, ReplacingMergeTree from datetime import date, datetime from .utils import comma_join, string_or_func @@ -538,8 +537,9 @@ class QuerySet(object): def final(self): """ Adds a FINAL modifier to table, meaning data will be collapsed to final version. - Can be used with `CollapsingMergeTree` engine only. + Can be used with the `CollapsingMergeTree` and `ReplacingMergeTree` engines only. """ + from .engines import CollapsingMergeTree, ReplacingMergeTree if not isinstance(self._model_cls.engine, (CollapsingMergeTree, ReplacingMergeTree)): raise TypeError('final() method can be used only with the CollapsingMergeTree and ReplacingMergeTree engines') diff --git a/tests/test_querysets.py b/tests/test_querysets.py index cec4616..c2e2b30 100644 --- a/tests/test_querysets.py +++ b/tests/test_querysets.py @@ -287,7 +287,7 @@ class QuerySetTestCase(TestCaseWithData): self._test_qs(qs[80:], 20) def test_final(self): - # Final can be used with CollapsingMergeTree engine only + # Final can be used with CollapsingMergeTree/ReplacingMergeTree engines only with self.assertRaises(TypeError): Person.objects_in(self.database).final() From 633c7ee1e97285b1c7d4cba52ab44d04ddf1ef00 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Tue, 23 Jun 2020 11:43:37 +0300 Subject: [PATCH 19/27] Refactor get_sql in enum fields --- src/infi/clickhouse_orm/fields.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index 07172ae..b63997e 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -89,7 +89,7 @@ class Field(FunctionOperatorsMixin): sql = self.db_type args = self.get_db_type_args() if args: - sql += '(%s)' % ', '.join(args) + sql += '(%s)' % comma_join(args) if with_default_expression: sql += self._extra_params(db) return sql @@ -480,15 +480,8 @@ class BaseEnumField(Field): def to_db_string(self, value, quote=True): return escape(value.name, quote) - def get_sql(self, with_default_expression=True, db=None): - values = ['%s = %d' % (escape(item.name), item.value) for item in self.enum_cls] - sql = '%s(%s)' % (self.db_type, ' ,'.join(values)) - if with_default_expression: - default = self.to_db_string(self.default) - sql = '%s DEFAULT %s' % (sql, default) - if self.codec and db and db.has_codec_support: - sql+= ' CODEC(%s)' % self.codec - return sql + def 
get_db_type_args(self): + return ['%s = %d' % (escape(item.name), item.value) for item in self.enum_cls] @classmethod def create_ad_hoc_field(cls, db_type): From 40a1e21348cffcd0b811c4dd395d35fe10154dd4 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Fri, 26 Jun 2020 17:53:39 +0300 Subject: [PATCH 20/27] Added usage examples --- README.md | 2 + examples/cpu_usage/.gitignore | 1 + examples/cpu_usage/README.md | 22 +++++ examples/cpu_usage/collect.py | 20 +++++ examples/cpu_usage/models.py | 11 +++ examples/cpu_usage/requirements.txt | 2 + examples/cpu_usage/results.py | 13 +++ examples/full_text_search/.gitignore | 2 + examples/full_text_search/README.md | 80 +++++++++++++++++ examples/full_text_search/download_ebooks.py | 27 ++++++ examples/full_text_search/load.py | 61 +++++++++++++ examples/full_text_search/models.py | 16 ++++ examples/full_text_search/requirements.txt | 4 + examples/full_text_search/search.py | 90 ++++++++++++++++++++ 14 files changed, 351 insertions(+) create mode 100644 examples/cpu_usage/.gitignore create mode 100644 examples/cpu_usage/README.md create mode 100644 examples/cpu_usage/collect.py create mode 100644 examples/cpu_usage/models.py create mode 100644 examples/cpu_usage/requirements.txt create mode 100644 examples/cpu_usage/results.py create mode 100644 examples/full_text_search/.gitignore create mode 100644 examples/full_text_search/README.md create mode 100644 examples/full_text_search/download_ebooks.py create mode 100644 examples/full_text_search/load.py create mode 100644 examples/full_text_search/models.py create mode 100644 examples/full_text_search/requirements.txt create mode 100644 examples/full_text_search/search.py diff --git a/README.md b/README.md index 8e300bd..462c8f4 100644 --- a/README.md +++ b/README.md @@ -52,4 +52,6 @@ for row in queryset.aggregate(CPUStats.cpu_id, average=F.avg(CPUStats.cpu_percen print('CPU {row.cpu_id}: {row.average:.2f}%'.format(row=row)) ``` +This and other examples can be found in the `examples` folder. + To learn more please visit the [documentation](docs/toc.md). diff --git a/examples/cpu_usage/.gitignore b/examples/cpu_usage/.gitignore new file mode 100644 index 0000000..47def24 --- /dev/null +++ b/examples/cpu_usage/.gitignore @@ -0,0 +1 @@ +/env/ diff --git a/examples/cpu_usage/README.md b/examples/cpu_usage/README.md new file mode 100644 index 0000000..c72ef52 --- /dev/null +++ b/examples/cpu_usage/README.md @@ -0,0 +1,22 @@ +# CPU Usage + +This basic example uses `psutil` to collect a simple time-series of per-CPU usage percent. It then prints out some aggregate statistics based on the collected data. + +## Running the code + +Create a virtualenv and install the required libraries: +``` +virtualenv -p python3.6 env +source env/bin/activate +pip install -r requirements.txt +``` + +Run the `collect` script to populate the database with the CPU statistics. Let it run for a bit before pressing CTRL+C. 
+``` +python collect.py +``` + +Run the `results` script to display the CPU statistics: +``` +python results.py +``` diff --git a/examples/cpu_usage/collect.py b/examples/cpu_usage/collect.py new file mode 100644 index 0000000..34ee5b4 --- /dev/null +++ b/examples/cpu_usage/collect.py @@ -0,0 +1,20 @@ +import psutil, time, datetime +from infi.clickhouse_orm import Database +from models import CPUStats + + +db = Database('demo') +db.create_table(CPUStats) + + +psutil.cpu_percent(percpu=True) # first sample should be discarded + +while True: + time.sleep(1) + stats = psutil.cpu_percent(percpu=True) + timestamp = datetime.datetime.now() + print(timestamp) + db.insert([ + CPUStats(timestamp=timestamp, cpu_id=cpu_id, cpu_percent=cpu_percent) + for cpu_id, cpu_percent in enumerate(stats) + ]) diff --git a/examples/cpu_usage/models.py b/examples/cpu_usage/models.py new file mode 100644 index 0000000..c19007a --- /dev/null +++ b/examples/cpu_usage/models.py @@ -0,0 +1,11 @@ +from infi.clickhouse_orm import Model, DateTimeField, UInt16Field, Float32Field, Memory + + +class CPUStats(Model): + + timestamp = DateTimeField() + cpu_id = UInt16Field() + cpu_percent = Float32Field() + + engine = Memory() + diff --git a/examples/cpu_usage/requirements.txt b/examples/cpu_usage/requirements.txt new file mode 100644 index 0000000..5e08b8f --- /dev/null +++ b/examples/cpu_usage/requirements.txt @@ -0,0 +1,2 @@ +infi.clickhouse_orm +psutil diff --git a/examples/cpu_usage/results.py b/examples/cpu_usage/results.py new file mode 100644 index 0000000..80b892f --- /dev/null +++ b/examples/cpu_usage/results.py @@ -0,0 +1,13 @@ +from infi.clickhouse_orm import Database, F +from models import CPUStats + + +db = Database('demo') +queryset = CPUStats.objects_in(db) +total = queryset.filter(CPUStats.cpu_id == 1).count() +busy = queryset.filter(CPUStats.cpu_id == 1, CPUStats.cpu_percent > 95).count() +print('CPU 1 was busy {:.2f}% of the time'.format(busy * 100.0 / total)) + +# Calculate the average usage per CPU +for row in queryset.aggregate(CPUStats.cpu_id, average=F.avg(CPUStats.cpu_percent)): + print('CPU {row.cpu_id}: {row.average:.2f}%'.format(row=row)) diff --git a/examples/full_text_search/.gitignore b/examples/full_text_search/.gitignore new file mode 100644 index 0000000..f6a740e --- /dev/null +++ b/examples/full_text_search/.gitignore @@ -0,0 +1,2 @@ +/ebooks/ +/env/ diff --git a/examples/full_text_search/README.md b/examples/full_text_search/README.md new file mode 100644 index 0000000..09a31ef --- /dev/null +++ b/examples/full_text_search/README.md @@ -0,0 +1,80 @@ +# Full Text Search + +This example shows how ClickHouse might be used for searching for word sequences in texts. It's a nice proof of concept, but for production use there are probably better solutions, such as Elasticsearch. 
+
+## Running the code
+
+Create a virtualenv and install the required libraries:
+```
+virtualenv -p python3.6 env
+source env/bin/activate
+pip install -r requirements.txt
+```
+Run the `download_ebooks` script to download a dozen classical books from [The Gutenberg Project](http://www.gutenberg.org/):
+```
+python download_ebooks.py
+```
+Run the `load` script to populate the database with the downloaded texts:
+```
+python load.py
+```
+And finally, run the full text search:
+```
+python search.py "cheshire cat"
+```
+Asterisks can be used as wildcards (each asterisk stands for one word):
+```
+python search.py "much * than"
+```
+
+## How it works
+
+The `models.py` file defines an ORM model for storing each word in the indexed texts:
+```python
+class Fragment(Model):
+
+    language = LowCardinalityField(StringField(), default='EN')
+    document = LowCardinalityField(StringField())
+    idx = UInt64Field()
+    word = StringField()
+    stem = StringField()
+
+    # An index for faster search by document and fragment idx
+    index = Index((document, idx), type=Index.minmax(), granularity=1)
+
+    # The primary key allows efficient lookup of stems
+    engine = MergeTree(order_by=(stem, document, idx), partition_key=('language',))
+```
+The `document` (name) and `idx` (running number of the word inside the document) fields identify the specific word. The `word` field stores the original word as it appears in the text, while the `stem` contains the word after normalization, and that is the field used for matching the search terms. Stemming the words makes the matching less strict, so that searching for "swallowed" will also find documents that mention "swallow" or "swallowing".
+
+Here's what some records in the fragment table might look like:
+
+| language | document                | idx  | word     | stem    |
+|----------|-------------------------|------|----------|---------|
+| EN       | Moby Dick; or The Whale | 4510 | whenever | whenev  |
+| EN       | Moby Dick; or The Whale | 4511 | it       | it      |
+| EN       | Moby Dick; or The Whale | 4512 | is       | is      |
+| EN       | Moby Dick; or The Whale | 4513 | a        | a       |
+| EN       | Moby Dick; or The Whale | 4514 | damp,    | damp    |
+| EN       | Moby Dick; or The Whale | 4515 | drizzly  | drizzli |
+| EN       | Moby Dick; or The Whale | 4516 | November | novemb  |
+| EN       | Moby Dick; or The Whale | 4517 | in       | in      |
+| EN       | Moby Dick; or The Whale | 4518 | my       | my      |
+| EN       | Moby Dick; or The Whale | 4519 | soul;    | soul    |
+
+Let's say we're looking for the terms "drizzly November". Finding the first word in the sequence (after stemming it) is fast and easy:
+```python
+query = Fragment.objects_in(db).filter(stem='drizzli').only(Fragment.document, Fragment.idx)
+```
+We're interested only in the `document` and `idx` fields, since they identify a specific word.
+
+To find the next word in the search terms, we need a subquery similar to the first one, with an additional condition that its index will be one greater than the index of the first word:
+```python
+subquery = Fragment.objects_in(db).filter(stem='novemb').only(Fragment.document, Fragment.idx)
+query = query.filter(F.isIn((Fragment.document, Fragment.idx + 1), subquery))
+```
+And so on: by adding another subquery for each additional search term, we can construct the whole sequence of words.
+
+As for wildcard support, when encountering a wildcard in the search terms we simply skip it - it does not need a subquery of its own, since it can match any word. It only increases the index offset, so that the query conditions "skip" one word in the sequence.
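+
+For example, here is a sketch of how the wildcard search `"much * than"` shown above would be assembled (reusing the names from the snippets above):
+```python
+stems = ['much', '*', 'than']
+# The first stem starts the query, exactly as before
+query = Fragment.objects_in(db).filter(stem='much').only(Fragment.document, Fragment.idx)
+# The wildcard at position 1 contributes no subquery - it only shifts the offset,
+# so 'than' is required to appear two positions after 'much'
+subquery = Fragment.objects_in(db).filter(stem='than').only(Fragment.document, Fragment.idx)
+query = query.filter(F.isIn((Fragment.document, Fragment.idx + 2), subquery))
+```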
+
+The algorithm for building this compound query can be found in the `build_query` function.
diff --git a/examples/full_text_search/download_ebooks.py b/examples/full_text_search/download_ebooks.py
new file mode 100644
index 0000000..170d5e1
--- /dev/null
+++ b/examples/full_text_search/download_ebooks.py
@@ -0,0 +1,29 @@
+import requests
+import os
+
+
+def download_ebook(id):
+    print(id, end=' ')
+    # Download the ebook's text
+    r = requests.get('https://www.gutenberg.org/files/{id}/{id}-0.txt'.format(id=id))
+    if r.status_code == 404:
+        print('NOT FOUND, SKIPPING')
+        return
+    r.raise_for_status()
+    # Find the ebook's title (falling back to its id if no title line is present)
+    text = r.content.decode('utf-8')
+    title = str(id)
+    for line in text.splitlines():
+        if line.startswith('Title:'):
+            title = line[6:].strip()
+            break
+    print(title)
+    # Save the ebook
+    with open('ebooks/{}.txt'.format(title), 'wb') as f:
+        f.write(r.content)
+
+
+if __name__ == "__main__":
+    os.makedirs('ebooks', exist_ok=True)
+    for i in [1342, 11, 84, 2701, 25525, 1661, 98, 74, 43, 215, 1400, 76]:
+        download_ebook(i)
diff --git a/examples/full_text_search/load.py b/examples/full_text_search/load.py
new file mode 100644
index 0000000..7cf43b0
--- /dev/null
+++ b/examples/full_text_search/load.py
@@ -0,0 +1,61 @@
+import sys
+import nltk
+from nltk.stem.porter import PorterStemmer
+from glob import glob
+from infi.clickhouse_orm import Database
+from models import Fragment
+
+
+def trim_punctuation(word):
+    '''
+    Trim punctuation characters from the beginning and end of the word
+    '''
+    start = end = len(word)
+    for i in range(len(word)):
+        if word[i].isalnum():
+            start = min(start, i)
+            end = i + 1
+    return word[start : end]
+
+
+def parse_file(filename):
+    '''
+    Parses a text file at the given path.
+    Returns a generator of tuples (original_word, stemmed_word)
+    The original_word may include punctuation characters.
+    '''
+    stemmer = PorterStemmer()
+    with open(filename, 'r', encoding='utf-8') as f:
+        for line in f:
+            for word in line.split():
+                yield (word, stemmer.stem(trim_punctuation(word)))
+
+
+def get_fragments(filename):
+    '''
+    Converts a text file at the given path to a generator
+    of Fragment instances.
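+    The document name is taken from the file name, without its extension.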
+    '''
+    from os import path
+    document = path.splitext(path.basename(filename))[0]
+    idx = 0
+    for word, stem in parse_file(filename):
+        idx += 1
+        yield Fragment(document=document, idx=idx, word=word, stem=stem)
+    print('{} - {} words'.format(filename, idx))
+
+
+if __name__ == '__main__':
+
+    # Load NLTK data if necessary
+    nltk.download('punkt')
+    nltk.download('wordnet')
+
+    # Initialize database
+    db = Database('default')
+    db.create_table(Fragment)
+
+    # Load files from the command line or everything under ebooks/
+    filenames = sys.argv[1:] or glob('ebooks/*.txt')
+    for filename in filenames:
+        db.insert(get_fragments(filename), batch_size=100000)
diff --git a/examples/full_text_search/models.py b/examples/full_text_search/models.py
new file mode 100644
index 0000000..130fe83
--- /dev/null
+++ b/examples/full_text_search/models.py
@@ -0,0 +1,16 @@
+from infi.clickhouse_orm import *
+
+
+class Fragment(Model):
+
+    language = LowCardinalityField(StringField(), default='EN')
+    document = LowCardinalityField(StringField())
+    idx = UInt64Field()
+    word = StringField()
+    stem = StringField()
+
+    # An index for faster search by document and fragment idx
+    index = Index((document, idx), type=Index.minmax(), granularity=1)
+
+    # The primary key allows efficient lookup of stems
+    engine = MergeTree(order_by=(stem, document, idx), partition_key=('language',))
diff --git a/examples/full_text_search/requirements.txt b/examples/full_text_search/requirements.txt
new file mode 100644
index 0000000..6d2f877
--- /dev/null
+++ b/examples/full_text_search/requirements.txt
@@ -0,0 +1,4 @@
+infi.clickhouse_orm
+nltk
+requests
+colorama
diff --git a/examples/full_text_search/search.py b/examples/full_text_search/search.py
new file mode 100644
index 0000000..ff5fcea
--- /dev/null
+++ b/examples/full_text_search/search.py
@@ -0,0 +1,90 @@
+import sys
+from colorama import init, Fore, Back, Style
+from nltk.stem.porter import PorterStemmer
+from infi.clickhouse_orm import Database, F
+from models import Fragment
+from load import trim_punctuation
+
+
+# The wildcard character
+WILDCARD = '*'
+
+
+def prepare_search_terms(text):
+    '''
+    Convert the text to search into a list of stemmed words.
+    '''
+    stemmer = PorterStemmer()
+    stems = []
+    for word in text.split():
+        if word == WILDCARD:
+            stems.append(WILDCARD)
+        else:
+            stems.append(stemmer.stem(trim_punctuation(word)))
+    return stems
+
+
+def build_query(db, stems):
+    '''
+    Returns a queryset instance for finding sequences of Fragment instances
+    that match the list of stemmed words.
+    '''
+    # Start by searching for the first stemmed word
+    all_fragments = Fragment.objects_in(db)
+    query = all_fragments.filter(stem=stems[0]).only(Fragment.document, Fragment.idx)
+    # Add the following words to the queryset
+    for i, stem in enumerate(stems):
+        # Skip the first word (it's already in the query), and wildcards
+        if i == 0 or stem == WILDCARD:
+            continue
+        # Create a subquery that finds instances of the i'th word
+        subquery = all_fragments.filter(stem=stem).only(Fragment.document, Fragment.idx)
+        # Add it to the query, requiring that it will appear i places away from the first word
+        query = query.filter(F.isIn((Fragment.document, Fragment.idx + i), subquery))
+    # Sort the results
+    query = query.order_by(Fragment.document, Fragment.idx)
+    return query
+
+
+def get_matching_text(db, document, from_idx, to_idx, extra=5):
+    '''
+    Reconstructs the document text between the given indexes (inclusive),
+    plus `extra` words before and after the match.
The words that are + included in the given range are highlighted in green. + ''' + text = [] + conds = (Fragment.document == document) & (Fragment.idx >= from_idx - extra) & (Fragment.idx <= to_idx + extra) + for fragment in Fragment.objects_in(db).filter(conds).order_by('document', 'idx'): + word = fragment.word + if fragment.idx == from_idx: + word = Fore.GREEN + word + if fragment.idx == to_idx: + word = word + Style.RESET_ALL + text.append(word) + return ' '.join(text) + + +def find(db, text): + ''' + Performs the search for the given text, and prints out the matches. + ''' + stems = prepare_search_terms(text) + query = build_query(db, stems) + print('\n' + Fore.MAGENTA + str(query) + Style.RESET_ALL + '\n') + for match in query: + text = get_matching_text(db, match.document, match.idx, match.idx + len(stems) - 1) + print(Fore.CYAN + match.document + ':' + Style.RESET_ALL, text) + + +if __name__ == '__main__': + + # Initialize colored output + init() + + # Initialize database + db = Database('default') + + # Search + text = ' '.join(sys.argv[1:]) + if text: + find(db, text) From 436c296609a484bf004199d3e52115f6a80ca46e Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sat, 27 Jun 2020 00:02:11 +0300 Subject: [PATCH 21/27] Support for mutations: `QuerySet.update` and `QuerySet.delete` --- CHANGELOG.md | 1 + docs/class_reference.md | 30 +++++++++++ docs/querysets.md | 19 ++++++- docs/toc.md | 1 + src/infi/clickhouse_orm/query.py | 39 +++++++++++++- tests/base_test_with_data.py | 5 ++ tests/test_mutations.py | 88 ++++++++++++++++++++++++++++++++ 7 files changed, 181 insertions(+), 2 deletions(-) create mode 100644 tests/test_mutations.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 979edeb..90f49fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ Unreleased ---------- - Support for model constraints - Support for data skipping indexes +- Support for mutations: `QuerySet.update` and `QuerySet.delete` - Support FINAL for `ReplacingMergeTree` (chripede) - Added `DateTime64Field` (NiyazNz) - Make `DateTimeField` and `DateTime64Field` timezone-aware (NiyazNz) diff --git a/docs/class_reference.md b/docs/class_reference.md index d415b13..cb718c5 100644 --- a/docs/class_reference.md +++ b/docs/class_reference.md @@ -1192,6 +1192,13 @@ Returns the contents of the query's `WHERE` or `PREWHERE` clause as a string. Returns the number of matching model instances. +#### delete() + + +Deletes all records matched by this queryset's conditions. +Note that ClickHouse performs deletions in the background, so they are not immediate. + + #### distinct() @@ -1268,6 +1275,14 @@ The result is a namedtuple containing `objects` (list), `number_of_objects`, Returns the selected fields or expressions as a SQL string. +#### update(**kwargs) + + +Updates all records matched by this queryset's conditions. +Keyword arguments specify the field names and expressions to use for the update. +Note that ClickHouse performs updates in the background, so they are not immediate. + + ### AggregateQuerySet Extends QuerySet @@ -1315,6 +1330,13 @@ Returns the contents of the query's `WHERE` or `PREWHERE` clause as a string. Returns the number of rows after aggregation. +#### delete() + + +Deletes all records matched by this queryset's conditions. +Note that ClickHouse performs deletions in the background, so they are not immediate. + + #### distinct() @@ -1397,6 +1419,14 @@ The result is a namedtuple containing `objects` (list), `number_of_objects`, Returns the selected fields or expressions as a SQL string. 
+
+#### update(**kwargs)
+
+
+Updates all records matched by this queryset's conditions.
+Keyword arguments specify the field names and expressions to use for the update.
+Note that ClickHouse performs updates in the background, so they are not immediate.
+
+
 #### with_totals()
 
 
diff --git a/docs/querysets.md b/docs/querysets.md
index d85ca06..76ecb0e 100644
--- a/docs/querysets.md
+++ b/docs/querysets.md
@@ -151,7 +151,7 @@ Adds a DISTINCT clause to the query, meaning that any duplicate rows in the resu
 94
 
 Final
---------
+-----
 
 This method can be used only with the `CollapsingMergeTree` and `ReplacingMergeTree` engines.
 Adds a FINAL modifier to the query, meaning that the selected data is fully "collapsed" by the engine.
 
@@ -203,6 +203,23 @@ The `paginate` method returns a `namedtuple` containing the following fields:
 
 Note that you should use `QuerySet.order_by` so that the ordering is unique, otherwise there might be inconsistencies in the pagination (such as an instance that appears on two different pages).
 
+Mutations
+---------
+
+To delete all records that match a queryset's conditions use the `delete` method:
+
+    Person.objects_in(database).filter(first_name='Max').delete()
+
+To update records that match a queryset's conditions call the `update` method and provide the field names to update and the expressions to use (as keyword arguments):
+
+    Person.objects_in(database).filter(first_name='Max').update(first_name='Maximilian')
+
+Note a few caveats:
+
+- ClickHouse cannot update columns that are used in the calculation of the primary or the partition key.
+- Mutations happen in the background, so they are not immediate.
+- Only tables in the `MergeTree` family support mutations.
+
 Aggregation
 -----------
 
diff --git a/docs/toc.md b/docs/toc.md
index a0f7e5e..2fd878f 100644
--- a/docs/toc.md
+++ b/docs/toc.md
@@ -32,6 +32,7 @@
     * [Final](querysets.md#final)
     * [Slicing](querysets.md#slicing)
     * [Pagination](querysets.md#pagination)
+    * [Mutations](querysets.md#mutations)
     * [Aggregation](querysets.md#aggregation)
     * [Adding totals](querysets.md#adding-totals)
 
diff --git a/src/infi/clickhouse_orm/query.py b/src/infi/clickhouse_orm/query.py
index 19cd9f4..92efec4 100644
--- a/src/infi/clickhouse_orm/query.py
+++ b/src/infi/clickhouse_orm/query.py
@@ -4,7 +4,7 @@ import pytz
 from copy import copy, deepcopy
 from math import ceil
 from datetime import date, datetime
-from .utils import comma_join, string_or_func
+from .utils import comma_join, string_or_func, arg_to_sql
 
 
 # TODO
@@ -547,6 +547,40 @@ class QuerySet(object):
         qs._final = True
         return qs
 
+    def delete(self):
+        """
+        Deletes all records matched by this queryset's conditions.
+        Note that ClickHouse performs deletions in the background, so they are not immediate.
+        """
+        self._verify_mutation_allowed()
+        conditions = (self._where_q & self._prewhere_q).to_sql(self._model_cls)
+        sql = 'ALTER TABLE $db.`%s` DELETE WHERE %s' % (self._model_cls.table_name(), conditions)
+        self._database.raw(sql)
+        return self
+
+    def update(self, **kwargs):
+        """
+        Updates all records matched by this queryset's conditions.
+        Keyword arguments specify the field names and expressions to use for the update.
+        Note that ClickHouse performs updates in the background, so they are not immediate.
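+
+        Example (mirroring the usage shown in docs/querysets.md):
+
+            Person.objects_in(database).filter(first_name='Max').update(first_name='Maximilian')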
+ """ + assert kwargs, 'No fields specified for update' + self._verify_mutation_allowed() + fields = comma_join('`%s` = %s' % (name, arg_to_sql(expr)) for name, expr in kwargs.items()) + conditions = (self._where_q & self._prewhere_q).to_sql(self._model_cls) + sql = 'ALTER TABLE $db.`%s` UPDATE %s WHERE %s' % (self._model_cls.table_name(), fields, conditions) + self._database.raw(sql) + return self + + def _verify_mutation_allowed(self): + ''' + Checks that the queryset's state allows mutations. Raises an AssertionError if not. + ''' + assert not self._limits, 'Mutations are not allowed after slicing the queryset' + assert not self._limit_by, 'Mutations are not allowed after calling limit_by(...)' + assert not self._distinct, 'Mutations are not allowed after calling distinct()' + assert not self._final, 'Mutations are not allowed after calling final()' + def aggregate(self, *args, **kwargs): """ Returns an `AggregateQuerySet` over this query, with `args` serving as @@ -647,6 +681,9 @@ class AggregateQuerySet(QuerySet): qs._grouping_with_totals = True return qs + def _verify_mutation_allowed(self): + raise AssertionError('Cannot mutate an AggregateQuerySet') + # Expose only relevant classes in import * __all__ = [c.__name__ for c in [Q, QuerySet, AggregateQuerySet]] diff --git a/tests/base_test_with_data.py b/tests/base_test_with_data.py index 8cbea48..66d6a25 100644 --- a/tests/base_test_with_data.py +++ b/tests/base_test_with_data.py @@ -21,6 +21,10 @@ class TestCaseWithData(unittest.TestCase): self.database.drop_table(Person) self.database.drop_database() + def _insert_all(self): + self.database.insert(self._sample_data()) + self.assertTrue(self.database.count(Person)) + def _insert_and_check(self, data, count, batch_size=1000): self.database.insert(data, batch_size=batch_size) self.assertEqual(count, self.database.count(Person)) @@ -32,6 +36,7 @@ class TestCaseWithData(unittest.TestCase): yield Person(**entry) + class Person(Model): first_name = StringField() diff --git a/tests/test_mutations.py b/tests/test_mutations.py new file mode 100644 index 0000000..d677ae1 --- /dev/null +++ b/tests/test_mutations.py @@ -0,0 +1,88 @@ +from infi.clickhouse_orm import F +from .base_test_with_data import * +from time import sleep + + +class MutationsTestCase(TestCaseWithData): + + def _wait_for_mutations(self): + sql = 'SELECT * FROM system.mutations WHERE is_done = 0' + while list(self.database.raw(sql)): + sleep(0.25) + + def test_delete_all(self): + self._insert_all() + Person.objects_in(self.database).delete() + self._wait_for_mutations() + self.assertFalse(Person.objects_in(self.database)) + + def test_delete_with_where_cond(self): + self._insert_all() + cond = Person.first_name == 'Cassady' + self.assertTrue(Person.objects_in(self.database).filter(cond)) + Person.objects_in(self.database).filter(cond).delete() + self._wait_for_mutations() + self.assertFalse(Person.objects_in(self.database).filter(cond)) + self.assertTrue(Person.objects_in(self.database).exclude(cond)) + + def test_delete_with_prewhere_cond(self): + self._insert_all() + cond = F.toYear(Person.birthday) == 1977 + self.assertTrue(Person.objects_in(self.database).filter(cond)) + Person.objects_in(self.database).filter(cond, prewhere=True).delete() + self._wait_for_mutations() + self.assertFalse(Person.objects_in(self.database).filter(cond)) + self.assertTrue(Person.objects_in(self.database).exclude(cond)) + + def test_update_all(self): + self._insert_all() + Person.objects_in(self.database).update(height=0) + 
self._wait_for_mutations() + for p in Person.objects_in(self.database): print(p.height) + self.assertFalse(Person.objects_in(self.database).exclude(height=0)) + + def test_update_with_where_cond(self): + self._insert_all() + cond = Person.first_name == 'Cassady' + Person.objects_in(self.database).filter(cond).update(height=0) + self._wait_for_mutations() + self.assertFalse(Person.objects_in(self.database).filter(cond).exclude(height=0)) + + def test_update_with_prewhere_cond(self): + self._insert_all() + cond = F.toYear(Person.birthday) == 1977 + Person.objects_in(self.database).filter(cond, prewhere=True).update(height=0) + self._wait_for_mutations() + self.assertFalse(Person.objects_in(self.database).filter(cond).exclude(height=0)) + + def test_update_multiple_fields(self): + self._insert_all() + Person.objects_in(self.database).update(height=0, passport=None) + self._wait_for_mutations() + self.assertFalse(Person.objects_in(self.database).exclude(height=0)) + self.assertFalse(Person.objects_in(self.database).exclude(passport=None)) + + def test_chained_update(self): + self._insert_all() + Person.objects_in(self.database).update(height=F.rand()).update(passport=99999) + self._wait_for_mutations() + self.assertFalse(Person.objects_in(self.database).exclude(passport=99999)) + + def test_invalid_state_for_mutations(self): + base_query = Person.objects_in(self.database) + queries = [ + base_query[0:1], + base_query.limit_by(5, 'first_name'), + base_query.distinct(), + base_query.aggregate('first_name', count=F.count()) + ] + for query in queries: + print(query) + with self.assertRaises(AssertionError): + query.delete() + with self.assertRaises(AssertionError): + query.update(height=1.8) + + def test_missing_fields_for_update(self): + with self.assertRaises(AssertionError): + Person.objects_in(self.database).update() From bc900c2ef1e372254b66733e770bd46db50386a8 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Sat, 27 Jun 2020 00:26:21 +0300 Subject: [PATCH 22/27] Skip mutations test on old ClickHouse versions --- tests/test_mutations.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/test_mutations.py b/tests/test_mutations.py index d677ae1..874c7bd 100644 --- a/tests/test_mutations.py +++ b/tests/test_mutations.py @@ -1,3 +1,4 @@ +import unittest from infi.clickhouse_orm import F from .base_test_with_data import * from time import sleep @@ -5,19 +6,23 @@ from time import sleep class MutationsTestCase(TestCaseWithData): + def setUp(self): + super().setUp() + if self.database.server_version < (18,): + raise unittest.SkipTest('ClickHouse version too old') + self._insert_all() + def _wait_for_mutations(self): sql = 'SELECT * FROM system.mutations WHERE is_done = 0' while list(self.database.raw(sql)): sleep(0.25) def test_delete_all(self): - self._insert_all() Person.objects_in(self.database).delete() self._wait_for_mutations() self.assertFalse(Person.objects_in(self.database)) def test_delete_with_where_cond(self): - self._insert_all() cond = Person.first_name == 'Cassady' self.assertTrue(Person.objects_in(self.database).filter(cond)) Person.objects_in(self.database).filter(cond).delete() @@ -26,7 +31,6 @@ class MutationsTestCase(TestCaseWithData): self.assertTrue(Person.objects_in(self.database).exclude(cond)) def test_delete_with_prewhere_cond(self): - self._insert_all() cond = F.toYear(Person.birthday) == 1977 self.assertTrue(Person.objects_in(self.database).filter(cond)) Person.objects_in(self.database).filter(cond, prewhere=True).delete() @@ -35,35 
+39,30 @@ class MutationsTestCase(TestCaseWithData): self.assertTrue(Person.objects_in(self.database).exclude(cond)) def test_update_all(self): - self._insert_all() Person.objects_in(self.database).update(height=0) self._wait_for_mutations() for p in Person.objects_in(self.database): print(p.height) self.assertFalse(Person.objects_in(self.database).exclude(height=0)) def test_update_with_where_cond(self): - self._insert_all() cond = Person.first_name == 'Cassady' Person.objects_in(self.database).filter(cond).update(height=0) self._wait_for_mutations() self.assertFalse(Person.objects_in(self.database).filter(cond).exclude(height=0)) def test_update_with_prewhere_cond(self): - self._insert_all() cond = F.toYear(Person.birthday) == 1977 Person.objects_in(self.database).filter(cond, prewhere=True).update(height=0) self._wait_for_mutations() self.assertFalse(Person.objects_in(self.database).filter(cond).exclude(height=0)) def test_update_multiple_fields(self): - self._insert_all() Person.objects_in(self.database).update(height=0, passport=None) self._wait_for_mutations() self.assertFalse(Person.objects_in(self.database).exclude(height=0)) self.assertFalse(Person.objects_in(self.database).exclude(passport=None)) def test_chained_update(self): - self._insert_all() Person.objects_in(self.database).update(height=F.rand()).update(passport=99999) self._wait_for_mutations() self.assertFalse(Person.objects_in(self.database).exclude(passport=99999)) From c0bdbb76642594642cc3dfb234ebd8a776204531 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Wed, 8 Jul 2020 08:14:40 +0300 Subject: [PATCH 23/27] Added usage examples --- examples/db_explorer/.gitignore | 1 + examples/db_explorer/charts.py | 38 ++++++++++ examples/db_explorer/requirements.txt | 15 ++++ examples/db_explorer/server.py | 63 ++++++++++++++++ examples/db_explorer/templates/base.html | 22 ++++++ examples/db_explorer/templates/database.html | 54 +++++++++++++ examples/db_explorer/templates/homepage.html | 41 ++++++++++ examples/db_explorer/templates/table.html | 79 ++++++++++++++++++++ 8 files changed, 313 insertions(+) create mode 100644 examples/db_explorer/.gitignore create mode 100644 examples/db_explorer/charts.py create mode 100644 examples/db_explorer/requirements.txt create mode 100644 examples/db_explorer/server.py create mode 100644 examples/db_explorer/templates/base.html create mode 100644 examples/db_explorer/templates/database.html create mode 100644 examples/db_explorer/templates/homepage.html create mode 100644 examples/db_explorer/templates/table.html diff --git a/examples/db_explorer/.gitignore b/examples/db_explorer/.gitignore new file mode 100644 index 0000000..47def24 --- /dev/null +++ b/examples/db_explorer/.gitignore @@ -0,0 +1 @@ +/env/ diff --git a/examples/db_explorer/charts.py b/examples/db_explorer/charts.py new file mode 100644 index 0000000..0def735 --- /dev/null +++ b/examples/db_explorer/charts.py @@ -0,0 +1,38 @@ +import pygal +from pygal.style import RotateStyle +from jinja2.filters import do_filesizeformat + + +number_formatter = lambda v: '{:,}'.format(v) +bytes_formatter = lambda v: do_filesizeformat(v, True) + + +def tables_piechart(db, by_field, value_formatter): + Tables = db.get_model_for_table('tables', system_table=True) + qs = Tables.objects_in(db).filter(database=db.db_name, is_temporary=False).exclude(engine='Buffer') + tuples = [(getattr(table, by_field), table.name) for table in qs] + return _generate_piechart(tuples, value_formatter) + + +def columns_piechart(db, tbl_name, by_field, 
value_formatter):
+    ColumnsTable = db.get_model_for_table('columns', system_table=True)
+    qs = ColumnsTable.objects_in(db).filter(database=db.db_name, table=tbl_name)
+    tuples = [(getattr(col, by_field), col.name) for col in qs]
+    return _generate_piechart(tuples, value_formatter)
+
+
+def _get_top_tuples(tuples, n=15):
+    non_zero_tuples = [t for t in tuples if t[0]]
+    sorted_tuples = sorted(non_zero_tuples, reverse=True)
+    if len(sorted_tuples) > n:
+        others = (sum(t[0] for t in sorted_tuples[n:]), 'others')
+        sorted_tuples = sorted_tuples[:n] + [others]
+    return sorted_tuples
+
+
+def _generate_piechart(tuples, value_formatter):
+    style = RotateStyle('#9e6ffe', background='white', legend_font_family='Roboto', legend_font_size=18, tooltip_font_family='Roboto', tooltip_font_size=24)
+    chart = pygal.Pie(style=style, margin=0, title=' ', value_formatter=value_formatter, truncate_legend=-1)
+    for t in _get_top_tuples(tuples):
+        chart.add(t[1], t[0])
+    return chart.render(is_unicode=True, disable_xml_declaration=True)
diff --git a/examples/db_explorer/requirements.txt b/examples/db_explorer/requirements.txt
new file mode 100644
index 0000000..8dee9f8
--- /dev/null
+++ b/examples/db_explorer/requirements.txt
@@ -0,0 +1,15 @@
+certifi==2020.4.5.2
+chardet==3.0.4
+click==7.1.2
+Flask==1.1.2
+idna==2.9
+infi.clickhouse-orm==2.0.1
+iso8601==0.1.12
+itsdangerous==1.1.0
+Jinja2==2.11.2
+MarkupSafe==1.1.1
+pygal==2.4.0
+pytz==2020.1
+requests==2.23.0
+urllib3==1.25.9
+Werkzeug==1.0.1
diff --git a/examples/db_explorer/server.py b/examples/db_explorer/server.py
new file mode 100644
index 0000000..3336eb2
--- /dev/null
+++ b/examples/db_explorer/server.py
@@ -0,0 +1,63 @@
+from infi.clickhouse_orm import Database, F
+from charts import tables_piechart, columns_piechart, number_formatter, bytes_formatter
+from flask import Flask
+from flask import render_template
+import sys
+
+
+app = Flask(__name__)
+
+
+@app.route('/')
+def homepage_view():
+    db = _get_db('system')
+    DatabasesTable = db.get_model_for_table('databases', system_table=True)
+    databases = DatabasesTable.objects_in(db).exclude(name='system').order_by(F.lower(DatabasesTable.name))
+    return render_template('homepage.html', db=db, databases=databases)
+
+
+@app.route('/<db_name>/')
+def database_view(db_name):
+    db = _get_db(db_name)
+    ColumnsTable = db.get_model_for_table('columns', system_table=True)
+    tables = ColumnsTable.objects_in(db).filter(database=db_name).aggregate(ColumnsTable.table,
+        compressed_size=F.sum(ColumnsTable.data_compressed_bytes),
+        uncompressed_size=F.sum(ColumnsTable.data_uncompressed_bytes),
+        ratio=F.sum(ColumnsTable.data_uncompressed_bytes) / F.sum(ColumnsTable.data_compressed_bytes)
+    ).order_by(F.lower(ColumnsTable.table))
+    return render_template('database.html',
+        db=db,
+        tables=tables,
+        tables_piechart_by_rows=tables_piechart(db, 'total_rows', value_formatter=number_formatter),
+        tables_piechart_by_size=tables_piechart(db, 'total_bytes', value_formatter=bytes_formatter),
+    )
+
+
+@app.route('/<db_name>/<tbl_name>/')
+def table_view(db_name, tbl_name):
+    db = _get_db(db_name)
+    TablesTable = db.get_model_for_table('tables', system_table=True)
+    tbl_info = TablesTable.objects_in(db).filter(database=db_name, name=tbl_name)[0]
+    create_table_sql = db.raw('SHOW CREATE TABLE %s FORMAT TabSeparatedRaw' % tbl_name)
+    ColumnsTable = db.get_model_for_table('columns', system_table=True)
+    columns = ColumnsTable.objects_in(db).filter(database=db_name, table=tbl_name)
+    return render_template('table.html',
+        db=db,
+        tbl_name=tbl_name,
+
tbl_info=tbl_info, + create_table_sql=create_table_sql, + columns=columns, + piechart=columns_piechart(db, tbl_name, 'data_compressed_bytes', value_formatter=bytes_formatter), + ) + + +def _get_db(db_name): + db_url = sys.argv[1] if len(sys.argv) > 1 else 'http://localhost:8123/' + username = sys.argv[2] if len(sys.argv) > 2 else None + password = sys.argv[3] if len(sys.argv) > 3 else None + return Database(db_name, db_url, username, password, readonly=True) + + +if __name__ == '__main__': + _get_db('system') # fail early on db connection problems + app.run(debug=True) diff --git a/examples/db_explorer/templates/base.html b/examples/db_explorer/templates/base.html new file mode 100644 index 0000000..0f343d7 --- /dev/null +++ b/examples/db_explorer/templates/base.html @@ -0,0 +1,22 @@ + + + + + ClickHouse Explorer + + + + + + + + +
+ + {% block contents %} + {% endblock %} + +
+ + + \ No newline at end of file diff --git a/examples/db_explorer/templates/database.html b/examples/db_explorer/templates/database.html new file mode 100644 index 0000000..9b72b7d --- /dev/null +++ b/examples/db_explorer/templates/database.html @@ -0,0 +1,54 @@ +{% extends "base.html" %} + +{% block contents %} + +

{{ db.db_name }}

+ +

+ Home + » + {{ db.db_name }} +

+ +
+ +
+

Top Tables by Size

+ {% autoescape false %} + {{ tables_piechart_by_size }} + {% endautoescape %} +
+ +
+

Top Tables by Rows

+ {% autoescape false %} + {{ tables_piechart_by_rows }} + {% endautoescape %} +
+ +
+ +

Tables ({{ tables.count() }})

+ + + + + + + + + + + + {% for table in tables %} + + + + + + {% endfor %} + +
+<th>Name</th><th>Uncompressed Size</th><th>Compressed Size</th><th>Compression Ratio</th>
+<td><a href="/{{ db.db_name }}/{{ table.table }}/">{{ table.table }}</a></td>
+<td>{{ table.uncompressed_size|filesizeformat(true) }}</td><td>{{ table.compressed_size|filesizeformat(true) }}</td><td>{% if table.uncompressed_size %} {{ "%.2f" % table.ratio }} {% else %} 1 {% endif %} : 1</td>
+ +{% endblock %} diff --git a/examples/db_explorer/templates/homepage.html b/examples/db_explorer/templates/homepage.html new file mode 100644 index 0000000..79997f7 --- /dev/null +++ b/examples/db_explorer/templates/homepage.html @@ -0,0 +1,41 @@ +{% extends "base.html" %} + +{% block contents %} + + +
+ +
+ +

ClickHouse Explorer

+ + + + + + + + + + + + + + +
+<td>URL</td><td>{{ db.db_url }}</td>
+<td>Version</td><td>{{ db.server_version|join('.') }}</td>
+<td>Timezone</td><td>{{ db.server_timezone }}</td>
+ +

Databases ({{ databases.count() }})

+
    + {% for d in databases %} +
  • + {{ d.name }} +
  • + {% endfor %} +
+ +
+ +
+ + +{% endblock %} diff --git a/examples/db_explorer/templates/table.html b/examples/db_explorer/templates/table.html new file mode 100644 index 0000000..682c275 --- /dev/null +++ b/examples/db_explorer/templates/table.html @@ -0,0 +1,79 @@ +{% extends "base.html" %} + +{% block contents %} + + +

+ Home + » + {{ db.db_name }} + » + {{ tbl_name }} +

+ +

{{ tbl_name }}

+ +
+ +
+

Details

+ + + + + + + + + + {% if tbl_info.total_rows %} + + + + + {% endif %} + + + + +
+<td>Total rows</td><td>{{ "{:,}".format(tbl_info.total_rows) }}</td>
+<td>Total size</td><td>{{ tbl_info.total_bytes|filesizeformat(true) }}</td>
+<td>Average row size</td><td>{{ (tbl_info.total_bytes / tbl_info.total_rows)|filesizeformat(true) }}</td>
+<td>Engine</td><td>{{ tbl_info.engine }}</td>
+
+ +
+

Top Columns by Size

+ {% autoescape false %} + {{ piechart }} + {% endautoescape %} +
+ +
+ +

Columns ({{ columns.count() }})

+ + + + + + + + + + + + + {% for col in columns %} + + + + + + + + {% endfor %} + +
+<th>Name</th><th>Type</th><th>Uncompressed Size</th><th>Compressed Size</th><th>Compression Ratio</th>
+<td>{{ col.name }}</td><td>{{ col.type }}</td><td>{{ col.data_uncompressed_bytes|filesizeformat(true) }}</td><td>{{ col.data_compressed_bytes|filesizeformat(true) }}</td><td>{% if col.data_compressed_bytes %} {{ "%.2f" % (col.data_uncompressed_bytes / col.data_compressed_bytes) }} {% else %} 1 {% endif %} : 1</td>
+ +

Table Definition

+
{{ create_table_sql }}
+ +{% endblock %} From 7acfc411d87ea3a766f70726cecb86e961cb8147 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Tue, 14 Jul 2020 22:01:32 +0300 Subject: [PATCH 24/27] Remove __future__ --- tests/base_test_with_data.py | 1 - tests/test_alias_fields.py | 1 - tests/test_array_fields.py | 1 - tests/test_buffer.py | 1 - tests/test_compressed_fields.py | 1 - tests/test_custom_fields.py | 1 - tests/test_database.py | 1 - tests/test_datetime_fields.py | 1 - tests/test_decimal_fields.py | 1 - tests/test_engines.py | 1 - tests/test_enum_fields.py | 1 - tests/test_fixed_string_fields.py | 1 - tests/test_inheritance.py | 1 - tests/test_ip_fields.py | 1 - tests/test_join.py | 1 - tests/test_materialized_fields.py | 1 - tests/test_migrations.py | 1 - tests/test_models.py | 1 - tests/test_nullable_fields.py | 1 - tests/test_querysets.py | 1 - tests/test_readonly.py | 1 - tests/test_server_errors.py | 1 - tests/test_simple_fields.py | 1 - tests/test_system_models.py | 2 -- tests/test_uuid_fields.py | 1 - 25 files changed, 26 deletions(-) diff --git a/tests/base_test_with_data.py b/tests/base_test_with_data.py index 66d6a25..53d2343 100644 --- a/tests/base_test_with_data.py +++ b/tests/base_test_with_data.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals import unittest from infi.clickhouse_orm.database import Database diff --git a/tests/test_alias_fields.py b/tests/test_alias_fields.py index 6f8f1f2..0039fa6 100644 --- a/tests/test_alias_fields.py +++ b/tests/test_alias_fields.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import unittest from datetime import date diff --git a/tests/test_array_fields.py b/tests/test_array_fields.py index f63ac2f..b0b18a2 100644 --- a/tests/test_array_fields.py +++ b/tests/test_array_fields.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import unittest from datetime import date diff --git a/tests/test_buffer.py b/tests/test_buffer.py index 2eb6eeb..14cc59a 100644 --- a/tests/test_buffer.py +++ b/tests/test_buffer.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals import unittest from infi.clickhouse_orm.models import BufferModel diff --git a/tests/test_compressed_fields.py b/tests/test_compressed_fields.py index 3c8282e..8d17571 100644 --- a/tests/test_compressed_fields.py +++ b/tests/test_compressed_fields.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import unittest import datetime import pytz diff --git a/tests/test_custom_fields.py b/tests/test_custom_fields.py index c0b739c..641da27 100644 --- a/tests/test_custom_fields.py +++ b/tests/test_custom_fields.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import unittest from infi.clickhouse_orm.database import Database from infi.clickhouse_orm.fields import Field, Int16Field diff --git a/tests/test_database.py b/tests/test_database.py index 2fd9864..38681d4 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals import unittest import datetime diff --git a/tests/test_datetime_fields.py b/tests/test_datetime_fields.py index 2f2300b..6c30ffb 100644 --- a/tests/test_datetime_fields.py +++ b/tests/test_datetime_fields.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import unittest import datetime import pytz diff --git a/tests/test_decimal_fields.py b/tests/test_decimal_fields.py index 10be093..622c6d6 100644 --- a/tests/test_decimal_fields.py +++ b/tests/test_decimal_fields.py @@ -1,5 +1,4 @@ # -*- coding: 
utf-8 -*- -from __future__ import unicode_literals import unittest from decimal import Decimal diff --git a/tests/test_engines.py b/tests/test_engines.py index 06c9f02..2fcc8c2 100644 --- a/tests/test_engines.py +++ b/tests/test_engines.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import unittest import datetime diff --git a/tests/test_enum_fields.py b/tests/test_enum_fields.py index 34cd3d0..9ad4cdb 100644 --- a/tests/test_enum_fields.py +++ b/tests/test_enum_fields.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import unittest from infi.clickhouse_orm.database import Database diff --git a/tests/test_fixed_string_fields.py b/tests/test_fixed_string_fields.py index e9e0124..a29f3ca 100644 --- a/tests/test_fixed_string_fields.py +++ b/tests/test_fixed_string_fields.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals import unittest from infi.clickhouse_orm.database import Database diff --git a/tests/test_inheritance.py b/tests/test_inheritance.py index e1d78e4..705a9b7 100644 --- a/tests/test_inheritance.py +++ b/tests/test_inheritance.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import unittest import datetime import pytz diff --git a/tests/test_ip_fields.py b/tests/test_ip_fields.py index f829db2..220aa1b 100644 --- a/tests/test_ip_fields.py +++ b/tests/test_ip_fields.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import unittest from ipaddress import IPv4Address, IPv6Address from infi.clickhouse_orm.database import Database diff --git a/tests/test_join.py b/tests/test_join.py index ce0ce03..48da1b3 100644 --- a/tests/test_join.py +++ b/tests/test_join.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals, print_function import unittest import json diff --git a/tests/test_materialized_fields.py b/tests/test_materialized_fields.py index af469cd..8893229 100644 --- a/tests/test_materialized_fields.py +++ b/tests/test_materialized_fields.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import unittest from datetime import date diff --git a/tests/test_migrations.py b/tests/test_migrations.py index fa5aef2..d00357a 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import unittest from infi.clickhouse_orm.database import Database, ServerError diff --git a/tests/test_models.py b/tests/test_models.py index 33fb6a7..579c2dd 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import unittest import datetime import pytz diff --git a/tests/test_nullable_fields.py b/tests/test_nullable_fields.py index e65c2f8..ab7a777 100644 --- a/tests/test_nullable_fields.py +++ b/tests/test_nullable_fields.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import unittest import pytz diff --git a/tests/test_querysets.py b/tests/test_querysets.py index c2e2b30..7f161e0 100644 --- a/tests/test_querysets.py +++ b/tests/test_querysets.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals, print_function import unittest from infi.clickhouse_orm.database import Database from infi.clickhouse_orm.query import Q diff --git a/tests/test_readonly.py b/tests/test_readonly.py index f7c8f49..bc9b252 100644 --- a/tests/test_readonly.py +++ b/tests/test_readonly.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals from infi.clickhouse_orm.database import DatabaseException, ServerError from .base_test_with_data import * 
diff --git a/tests/test_server_errors.py b/tests/test_server_errors.py index 7445e5f..60fbd2b 100644 --- a/tests/test_server_errors.py +++ b/tests/test_server_errors.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import unittest from infi.clickhouse_orm.database import ServerError diff --git a/tests/test_simple_fields.py b/tests/test_simple_fields.py index 0e89ed5..247e096 100644 --- a/tests/test_simple_fields.py +++ b/tests/test_simple_fields.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import unittest from infi.clickhouse_orm.fields import * from datetime import date, datetime diff --git a/tests/test_system_models.py b/tests/test_system_models.py index 1d8b8cc..5d3a862 100644 --- a/tests/test_system_models.py +++ b/tests/test_system_models.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import unittest from datetime import date diff --git a/tests/test_uuid_fields.py b/tests/test_uuid_fields.py index d81e8eb..284d8f5 100644 --- a/tests/test_uuid_fields.py +++ b/tests/test_uuid_fields.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import unittest from uuid import UUID from infi.clickhouse_orm.database import Database From 80b220c1e3279ab10688d86038e3cbcdbb8b64c2 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Tue, 14 Jul 2020 22:01:50 +0300 Subject: [PATCH 25/27] Added functions for working with external dictionaries --- CHANGELOG.md | 1 + docs/class_reference.md | 15 ++++ src/infi/clickhouse_orm/funcs.py | 22 ++++++ tests/test_dictionaries.py | 131 +++++++++++++++++++++++++++++++ 4 files changed, 169 insertions(+) create mode 100644 tests/test_dictionaries.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 90f49fe..1cfd096 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ Unreleased - Support for model constraints - Support for data skipping indexes - Support for mutations: `QuerySet.update` and `QuerySet.delete` +- Added functions for working with external dictionaries - Support FINAL for `ReplacingMergeTree` (chripede) - Added `DateTime64Field` (NiyazNz) - Make `DateTimeField` and `DateTime64Field` timezone-aware (NiyazNz) diff --git a/docs/class_reference.md b/docs/class_reference.md index cb718c5..08716a5 100644 --- a/docs/class_reference.md +++ b/docs/class_reference.md @@ -1913,6 +1913,21 @@ Initializer. 
#### covarSampOrNullIf(y, cond) +#### dictGet(attr_name, id_expr) + + +#### dictGetHierarchy(id_expr) + + +#### dictGetOrDefault(attr_name, id_expr, default) + + +#### dictHas(id_expr) + + +#### dictIsIn(child_id_expr, ancestor_id_expr) + + #### divide(**kwargs) diff --git a/src/infi/clickhouse_orm/funcs.py b/src/infi/clickhouse_orm/funcs.py index 2763fa7..d84c761 100644 --- a/src/infi/clickhouse_orm/funcs.py +++ b/src/infi/clickhouse_orm/funcs.py @@ -1789,6 +1789,28 @@ class F(Cond, FunctionOperatorsMixin, metaclass=FMeta): def greatest(x, y): return F('greatest', x, y) + # Dictionary functions + + @staticmethod + def dictGet(dict_name, attr_name, id_expr): + return F('dictGet', dict_name, attr_name, id_expr) + + @staticmethod + def dictGetOrDefault(dict_name, attr_name, id_expr, default): + return F('dictGetOrDefault', dict_name, attr_name, id_expr, default) + + @staticmethod + def dictHas(dict_name, id_expr): + return F('dictHas', dict_name, id_expr) + + @staticmethod + def dictGetHierarchy(dict_name, id_expr): + return F('dictGetHierarchy', dict_name, id_expr) + + @staticmethod + def dictIsIn(dict_name, child_id_expr, ancestor_id_expr): + return F('dictIsIn', dict_name, child_id_expr, ancestor_id_expr) + # Expose only relevant classes in import * __all__ = ['F'] diff --git a/tests/test_dictionaries.py b/tests/test_dictionaries.py new file mode 100644 index 0000000..7da4160 --- /dev/null +++ b/tests/test_dictionaries.py @@ -0,0 +1,131 @@ +import unittest +import logging + +from infi.clickhouse_orm import * + + +class DictionaryTestMixin: + + def setUp(self): + self.database = Database('test-db', log_statements=True) + if self.database.server_version < (20, 1, 11, 73): + raise unittest.SkipTest('ClickHouse version too old') + self._create_dictionary() + + def tearDown(self): + self.database.drop_database() + + def _test_func(self, func, expected_value): + sql = 'SELECT %s AS value' % func.to_sql() + logging.info(sql) + result = list(self.database.select(sql)) + logging.info('\t==> %s', result[0].value if result else '') + print('Comparing %s to %s' % (result[0].value, expected_value)) + self.assertEqual(result[0].value, expected_value) + return result[0].value if result else None + + +class SimpleDictionaryTest(DictionaryTestMixin, unittest.TestCase): + + def _create_dictionary(self): + # Create a table to be used as source for the dictionary + self.database.create_table(NumberName) + self.database.insert( + NumberName(number=i, name=name) + for i, name in enumerate('Zero One Two Three Four Five Six Seven Eight Nine Ten'.split()) + ) + # Create the dictionary + self.database.raw(""" + CREATE DICTIONARY numbers_dict( + number UInt64, + name String DEFAULT '?' 
+ ) + PRIMARY KEY number + SOURCE(CLICKHOUSE( + HOST 'localhost' PORT 9000 USER 'default' PASSWORD '' DB 'test-db' TABLE 'numbername' + )) + LIFETIME(100) + LAYOUT(HASHED()); + """) + self.dict_name = 'test-db.numbers_dict' + + def test_dictget(self): + self._test_func(F.dictGet(self.dict_name, 'name', F.toUInt64(3)), 'Three') + self._test_func(F.dictGet(self.dict_name, 'name', F.toUInt64(99)), '?') + + def test_dictgetordefault(self): + self._test_func(F.dictGetOrDefault(self.dict_name, 'name', F.toUInt64(3), 'n/a'), 'Three') + self._test_func(F.dictGetOrDefault(self.dict_name, 'name', F.toUInt64(99), 'n/a'), 'n/a') + + def test_dicthas(self): + self._test_func(F.dictHas(self.dict_name, F.toUInt64(3)), 1) + self._test_func(F.dictHas(self.dict_name, F.toUInt64(99)), 0) + + +class HierarchicalDictionaryTest(DictionaryTestMixin, unittest.TestCase): + + def _create_dictionary(self): + # Create a table to be used as source for the dictionary + self.database.create_table(Region) + self.database.insert([ + Region(region_id=1, parent_region=0, region_name='Russia'), + Region(region_id=2, parent_region=1, region_name='Moscow'), + Region(region_id=3, parent_region=2, region_name='Center'), + Region(region_id=4, parent_region=0, region_name='Great Britain'), + Region(region_id=5, parent_region=4, region_name='London'), + ]) + # Create the dictionary + self.database.raw(""" + CREATE DICTIONARY regions_dict( + region_id UInt64, + parent_region UInt64 HIERARCHICAL, + region_name String DEFAULT '?' + ) + PRIMARY KEY region_id + SOURCE(CLICKHOUSE( + HOST 'localhost' PORT 9000 USER 'default' PASSWORD '' DB 'test-db' TABLE 'region' + )) + LIFETIME(100) + LAYOUT(HASHED()); + """) + self.dict_name = 'test-db.regions_dict' + + def test_dictget(self): + self._test_func(F.dictGet(self.dict_name, 'region_name', F.toUInt64(3)), 'Center') + self._test_func(F.dictGet(self.dict_name, 'parent_region', F.toUInt64(3)), 2) + self._test_func(F.dictGet(self.dict_name, 'region_name', F.toUInt64(99)), '?') + + def test_dictgetordefault(self): + self._test_func(F.dictGetOrDefault(self.dict_name, 'region_name', F.toUInt64(3), 'n/a'), 'Center') + self._test_func(F.dictGetOrDefault(self.dict_name, 'region_name', F.toUInt64(99), 'n/a'), 'n/a') + + def test_dicthas(self): + self._test_func(F.dictHas(self.dict_name, F.toUInt64(3)), 1) + self._test_func(F.dictHas(self.dict_name, F.toUInt64(99)), 0) + + def test_dictgethierarchy(self): + self._test_func(F.dictGetHierarchy(self.dict_name, F.toUInt64(3)), [3, 2, 1]) + self._test_func(F.dictGetHierarchy(self.dict_name, F.toUInt64(99)), [99]) + + def test_dictisin(self): + self._test_func(F.dictIsIn(self.dict_name, F.toUInt64(3), F.toUInt64(1)), 1) + self._test_func(F.dictIsIn(self.dict_name, F.toUInt64(3), F.toUInt64(4)), 0) + self._test_func(F.dictIsIn(self.dict_name, F.toUInt64(99), F.toUInt64(4)), 0) + + +class NumberName(Model): + ''' A table to act as a source for the dictionary ''' + + number = UInt64Field() + name = StringField() + + engine = Memory() + + +class Region(Model): + + region_id = UInt64Field() + parent_region = UInt64Field() + region_name = StringField() + + engine = Memory() From 97773d6dc76adb8304c57b948ca3bf3b48186447 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Wed, 15 Jul 2020 23:42:40 +0300 Subject: [PATCH 26/27] Added usage examples --- examples/db_explorer/README.md | 36 ++++++++++++++++++++++++++++++++++ examples/db_explorer/charts.py | 24 +++++++++++++++++++++++ examples/db_explorer/server.py | 30 +++++++++++++++++++++++++--- 3 files changed, 87 
insertions(+), 3 deletions(-) create mode 100644 examples/db_explorer/README.md diff --git a/examples/db_explorer/README.md b/examples/db_explorer/README.md new file mode 100644 index 0000000..1ef0bff --- /dev/null +++ b/examples/db_explorer/README.md @@ -0,0 +1,36 @@ +# DB Explorer + +This is a simple Flask web application that connects to ClickHouse and displays the list of existing databases. Clicking on a database name drills down into it, showing its list of tables. Clicking on a table drills down further, showing details about the table and its columns. + +For each table or column, the application displays the compressed size on disk, the uncompressed size, and the ratio between them. Additionally, several pie charts are shown - top tables by size, top tables by rows, and top columns by size (in a table). + +The pie charts are generated using the `pygal` charting library. + +ORM concepts that are demonstrated by this example: + +- Creating ORM models from existing tables using `Database.get_model_for_table` +- Queryset filtering +- Queryset aggregation + +## Running the code + +Create a virtualenv and install the required libraries: +``` +virtualenv -p python3.6 env +source env/bin/activate +pip install -r requirements.txt +``` + +Run the server and open http://127.0.0.1:5000/ in your browser: +``` +python server.py +``` + +By default the server connects to ClickHouse running on http://localhost:8123/ without a username or password, but you can change this using command line arguments: +``` +python server.py http://myclickhouse:8123/ +``` +or: +``` +python server.py http://myclickhouse:8123/ admin secret123 +``` diff --git a/examples/db_explorer/charts.py b/examples/db_explorer/charts.py index 0def735..1a690e6 100644 --- a/examples/db_explorer/charts.py +++ b/examples/db_explorer/charts.py @@ -3,11 +3,18 @@ from pygal.style import RotateStyle from jinja2.filters import do_filesizeformat +# Formatting functions number_formatter = lambda v: '{:,}'.format(v) bytes_formatter = lambda v: do_filesizeformat(v, True) def tables_piechart(db, by_field, value_formatter): + ''' + Generate a pie chart of the top n tables in the database. + `db` - the database instance + `by_field` - the field name to sort by + `value_formatter` - a function to use for formatting the numeric values + ''' Tables = db.get_model_for_table('tables', system_table=True) qs = Tables.objects_in(db).filter(database=db.db_name, is_temporary=False).exclude(engine='Buffer') tuples = [(getattr(table, by_field), table.name) for table in qs] @@ -15,6 +22,13 @@ def tables_piechart(db, by_field, value_formatter): def columns_piechart(db, tbl_name, by_field, value_formatter): + ''' + Generate a pie chart of the top n columns in the table. + `db` - the database instance + `tbl_name` - the table name + `by_field` - the field name to sort by + `value_formatter` - a function to use for formatting the numeric values + ''' ColumnsTable = db.get_model_for_table('columns', system_table=True) qs = ColumnsTable.objects_in(db).filter(database=db.db_name, table=tbl_name) tuples = [(getattr(col, by_field), col.name) for col in qs] @@ -22,6 +36,11 @@ def columns_piechart(db, tbl_name, by_field, value_formatter): def _get_top_tuples(tuples, n=15): + ''' + Given a list of tuples (value, name), this function sorts + the list and returns only the top n results. All other tuples + are aggregated to a single "others" tuple. 
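+    For example, with n=2 the list [(5, 'a'), (9, 'b'), (0, 'c'), (3, 'd')]
+    becomes [(9, 'b'), (5, 'a'), (3, 'others')].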
+    '''
     non_zero_tuples = [t for t in tuples if t[0]]
     sorted_tuples = sorted(non_zero_tuples, reverse=True)
     if len(sorted_tuples) > n:
@@ -31,6 +50,11 @@
 
 
 def _generate_piechart(tuples, value_formatter):
+    '''
+    Generates a pie chart.
+    `tuples` - a list of (value, name) tuples to include in the chart
+    `value_formatter` - a function to use for formatting the values
+    '''
     style = RotateStyle('#9e6ffe', background='white', legend_font_family='Roboto', legend_font_size=18, tooltip_font_family='Roboto', tooltip_font_size=24)
     chart = pygal.Pie(style=style, margin=0, title=' ', value_formatter=value_formatter, truncate_legend=-1)
     for t in _get_top_tuples(tuples):
diff --git a/examples/db_explorer/server.py b/examples/db_explorer/server.py
index 3336eb2..6241ed9 100644
--- a/examples/db_explorer/server.py
+++ b/examples/db_explorer/server.py
@@ -10,21 +10,34 @@ app = Flask(__name__)
 
 @app.route('/')
 def homepage_view():
+    '''
+    Root view that lists all databases.
+    '''
     db = _get_db('system')
+    # Get all databases in the system.databases table
     DatabasesTable = db.get_model_for_table('databases', system_table=True)
-    databases = DatabasesTable.objects_in(db).exclude(name='system').order_by(F.lower(DatabasesTable.name))
+    databases = DatabasesTable.objects_in(db).exclude(name='system')
+    databases = databases.order_by(F.lower(DatabasesTable.name))
+    # Generate the page
     return render_template('homepage.html', db=db, databases=databases)
 
 
 @app.route('/<db_name>/')
 def database_view(db_name):
+    '''
+    A view that displays information about a single database.
+    '''
     db = _get_db(db_name)
+    # Get all the tables in the database, by aggregating information from system.columns
     ColumnsTable = db.get_model_for_table('columns', system_table=True)
-    tables = ColumnsTable.objects_in(db).filter(database=db_name).aggregate(ColumnsTable.table,
+    tables = ColumnsTable.objects_in(db).filter(database=db_name).aggregate(
+        ColumnsTable.table,
         compressed_size=F.sum(ColumnsTable.data_compressed_bytes),
         uncompressed_size=F.sum(ColumnsTable.data_uncompressed_bytes),
         ratio=F.sum(ColumnsTable.data_uncompressed_bytes) / F.sum(ColumnsTable.data_compressed_bytes)
-    ).order_by(F.lower(ColumnsTable.table))
+    )
+    tables = tables.order_by(F.lower(ColumnsTable.table))
+    # Generate the page
     return render_template('database.html',
                            db=db,
                            tables=tables,
@@ -35,12 +48,19 @@
 
 
 @app.route('/<db_name>/<tbl_name>/')
 def table_view(db_name, tbl_name):
+    '''
+    A view that displays information about a single table.
+    '''
     db = _get_db(db_name)
+    # Get table information from system.tables
     TablesTable = db.get_model_for_table('tables', system_table=True)
     tbl_info = TablesTable.objects_in(db).filter(database=db_name, name=tbl_name)[0]
+    # Get the SQL used for creating the table
     create_table_sql = db.raw('SHOW CREATE TABLE %s FORMAT TabSeparatedRaw' % tbl_name)
+    # Get all columns in the table from system.columns
     ColumnsTable = db.get_model_for_table('columns', system_table=True)
     columns = ColumnsTable.objects_in(db).filter(database=db_name, table=tbl_name)
+    # Generate the page
     return render_template('table.html',
                            db=db,
                            tbl_name=tbl_name,
@@ -52,6 +72,10 @@
 
 
 def _get_db(db_name):
+    '''
+    Returns a Database instance using connection information
+    from the command line arguments (optional).
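+    For example, `python server.py http://myclickhouse:8123/ admin secret123`
+    passes the URL, username and password described in the README.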
+ ''' db_url = sys.argv[1] if len(sys.argv) > 1 else 'http://localhost:8123/' username = sys.argv[2] if len(sys.argv) > 2 else None password = sys.argv[3] if len(sys.argv) > 3 else None From db194d733fbc412b9443d5eebfb54377681005f5 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Thu, 16 Jul 2020 07:21:35 +0300 Subject: [PATCH 27/27] Releasing v2.1.0 --- CHANGELOG.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1cfd096..c3286b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ Change Log ========== -Unreleased ----------- +v2.1.0 +------ - Support for model constraints - Support for data skipping indexes - Support for mutations: `QuerySet.update` and `QuerySet.delete` @@ -11,9 +11,9 @@ Unreleased - Added `DateTime64Field` (NiyazNz) - Make `DateTimeField` and `DateTime64Field` timezone-aware (NiyazNz) -**Backwards incompatibile changes** +**Backwards incompatible changes** -Previously, `DateTimeField` always converted its value from the database timezone to UTC. This is no longer the case: the field's value now preserves the timezone it was defined with, or if not specified - the database's global timezone. This change has no effect if your database timezone is set UTC. +Previously, `DateTimeField` always converted its value from the database timezone to UTC. This is no longer the case: the field's value now preserves the timezone it was defined with, or if not specified - the database's global timezone. This change has no effect if your database timezone is set to UTC. v2.0.1 ------
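
To make the mutations entry in this release concrete, below is a minimal sketch of `QuerySet.update` and `QuerySet.delete`. The `Person` model, its fields and the `demo_db` database name are illustrative assumptions; only the two queryset methods themselves are taken from the release notes above:
```
from infi.clickhouse_orm.database import Database
from infi.clickhouse_orm.engines import MergeTree
from infi.clickhouse_orm.fields import Float32Field, StringField
from infi.clickhouse_orm.models import Model


class Person(Model):
    # Illustrative model - mutations apply to MergeTree-family tables
    first_name = StringField()
    last_name = StringField()
    height = Float32Field()

    engine = MergeTree(order_by=('first_name', 'last_name'))


db = Database('demo_db')  # assumes a ClickHouse server at http://localhost:8123/
db.create_table(Person)

# Issues ALTER TABLE ... UPDATE for rows matching the queryset's conditions
Person.objects_in(db).filter(Person.last_name == 'Smith').update(height=1.75)

# Issues ALTER TABLE ... DELETE for rows matching the queryset's conditions
Person.objects_in(db).filter(Person.height < 1.0).delete()
```
Note that ClickHouse executes mutations asynchronously, so their effect may not be visible immediately after these calls return.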