From 01cd88a93806b1ccbcaedca6c4ec7e0db70316bd Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Wed, 25 Jan 2017 16:41:01 +0200 Subject: [PATCH 01/11] TRIVIAL --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index fec606b..8e8c263 100644 --- a/README.rst +++ b/README.rst @@ -152,7 +152,7 @@ Pagination It is possible to paginate through model instances:: >>> order_by = 'first_name, last_name' - >>> page = db.paginate(Person, order_by, page_num=1, page_size=100) + >>> page = db.paginate(Person, order_by, page_num=1, page_size=10) >>> print page.number_of_objects 2507 >>> print page.pages_total From ca341ea997129e5d820b1397fa0cf5394b164571 Mon Sep 17 00:00:00 2001 From: M1ha Date: Thu, 26 Jan 2017 15:42:33 +0500 Subject: [PATCH 02/11] Added MaterializedField and AliasField --- src/infi/clickhouse_orm/database.py | 4 +- src/infi/clickhouse_orm/fields.py | 76 +++++++++++++++++++++++++++++ src/infi/clickhouse_orm/models.py | 9 +++- tests/sample_migrations/0008.py | 6 +++ tests/sample_migrations/0009.py | 6 +++ tests/test_alias_fields.py | 57 ++++++++++++++++++++++ tests/test_materialized_fields.py | 57 ++++++++++++++++++++++ tests/test_migrations.py | 31 ++++++++++++ 8 files changed, 242 insertions(+), 4 deletions(-) create mode 100644 tests/sample_migrations/0008.py create mode 100644 tests/sample_migrations/0009.py create mode 100644 tests/test_alias_fields.py create mode 100644 tests/test_materialized_fields.py diff --git a/src/infi/clickhouse_orm/database.py b/src/infi/clickhouse_orm/database.py index 9ce61a0..73bbc13 100644 --- a/src/infi/clickhouse_orm/database.py +++ b/src/infi/clickhouse_orm/database.py @@ -50,11 +50,11 @@ class Database(object): model_class = first_instance.__class__ def gen(): yield self._substitute('INSERT INTO $table FORMAT TabSeparated\n', model_class).encode('utf-8') - yield (first_instance.to_tsv() + '\n').encode('utf-8') + yield (first_instance.to_tsv(insertable_only=True) + 
'\n').encode('utf-8') # Collect lines in batches of batch_size batch = [] for instance in i: - batch.append(instance.to_tsv()) + batch.append(instance.to_tsv(insertable_only=True)) if len(batch) >= batch_size: # Return the current batch of lines yield ('\n'.join(batch) + '\n').encode('utf-8') diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index e4a5615..947c56a 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -12,6 +12,9 @@ class Field(object): class_default = 0 db_type = None + # This flag indicates, if we should take this field value when inserting data + insertable = True + def __init__(self, default=None): self.creation_counter = Field.creation_counter Field.creation_counter += 1 @@ -295,3 +298,76 @@ class ArrayField(Field): def get_sql(self, with_default=True): from .utils import escape return 'Array(%s)' % self.inner_field.get_sql(with_default=False) + + +class RelativeField(Field): + insertable = False + + def __init__(self, inner_field): + """ + Creates MATERIALIZED or ALIAS field + :param inner_field: Field subclass this field is acting like + """ + assert isinstance(inner_field, Field), "field must be Field subclass" + self.class_default = inner_field.class_default + self.default = inner_field.default + super(RelativeField, self).__init__() + self.inner_field = inner_field + + def to_python(self, value): + return self.inner_field.to_python(value) + + def validate(self, value): + return self.inner_field.validate(value) + + def to_db_string(self, value, quote=True): + return self.inner_field.to_db_string(value, quote=quote) + + +class MaterializedField(RelativeField): + """ + Creates ClickHouse MATERIALIZED field. 
It doesn't contain real data in database, it is counted on the spot + https://clickhouse.yandex/reference_en.html#Default values + """ + + def __init__(self, inner_field, code): + """ + Creates MATERIALIZED field + :param inner_field: Field subclass this field is acting like + :param code: ClickHouse code to execute when materialized field is called. See ClickHouse docs. + """ + super(MaterializedField, self).__init__(inner_field) + + self._code = code + + def get_sql(self, with_default=True): + """ + Generates SQL for create table command + :param with_default: This flag is inherited from Field model. Does nothing (MATERIALIZED have no default) + :return: Creation SQL string + """ + return '%s MATERIALIZED %s' % (self.inner_field.db_type, self._code) + + +class AliasField(RelativeField): + """ + Creates ClickHouse ALIAS field. It doesn't contain real data in database, only copies other one + https://clickhouse.yandex/reference_en.html#Default values + """ + + def __init__(self, inner_field, base_field_name): + """ + Creates ALIAS field + :param inner_field: Field instance this field is acting like + :param base_field_name: Name of field, to which alias is built + """ + super(AliasField, self).__init__(inner_field) + self.base_field_name = base_field_name + + def get_sql(self, with_default=True): + """ + Generates SQL for create table command + :param with_default: This flag is inherited from Field model. 
Does nothing (ALIAS have no default) + :return: Creation SQL string + """ + return '%s ALIAS %s' % (self.inner_field.db_type, self.base_field_name) diff --git a/src/infi/clickhouse_orm/models.py b/src/infi/clickhouse_orm/models.py index 6fae876..2e7f836 100644 --- a/src/infi/clickhouse_orm/models.py +++ b/src/infi/clickhouse_orm/models.py @@ -150,9 +150,14 @@ class Model(with_metaclass(ModelBase)): kwargs[name] = next(values) return cls(**kwargs) - def to_tsv(self): + def to_tsv(self, insertable_only=False): ''' Returns the instance's column values as a tab-separated line. A newline is not included. + :param bool insertable_only: If True, returns only fields, that can be inserted into database ''' data = self.__dict__ - return '\t'.join(field.to_db_string(data[name], quote=False) for name, field in self._fields) + + fields = [f for f in self._fields if f[1].insertable] if insertable_only else self._fields + return '\t'.join(field.to_db_string(data[name], quote=False) for name, field in fields) + + diff --git a/tests/sample_migrations/0008.py b/tests/sample_migrations/0008.py new file mode 100644 index 0000000..691a762 --- /dev/null +++ b/tests/sample_migrations/0008.py @@ -0,0 +1,6 @@ +from infi.clickhouse_orm import migrations +from ..test_migrations import * + +operations = [ + migrations.CreateTable(MaterializedModel) +] \ No newline at end of file diff --git a/tests/sample_migrations/0009.py b/tests/sample_migrations/0009.py new file mode 100644 index 0000000..7841f17 --- /dev/null +++ b/tests/sample_migrations/0009.py @@ -0,0 +1,6 @@ +from infi.clickhouse_orm import migrations +from ..test_migrations import * + +operations = [ + migrations.CreateTable(AliasModel) +] \ No newline at end of file diff --git a/tests/test_alias_fields.py b/tests/test_alias_fields.py new file mode 100644 index 0000000..87a54e6 --- /dev/null +++ b/tests/test_alias_fields.py @@ -0,0 +1,57 @@ +import unittest +from datetime import date + +from infi.clickhouse_orm.database import 
Database +from infi.clickhouse_orm.models import Model +from infi.clickhouse_orm.fields import * +from infi.clickhouse_orm.engines import * + + +class MaterializedFieldsTest(unittest.TestCase): + + def setUp(self): + self.database = Database('test-db') + self.database.create_table(ModelWithAliasFields) + + def tearDown(self): + self.database.drop_database() + + def test_insert_and_select(self): + instance = ModelWithAliasFields( + date_field='2016-08-30', + int_field=-10, + str_field='TEST' + ) + self.database.insert([instance]) + # We can't select * from table, as it doesn't select materialized and alias fields + query = 'SELECT date_field, int_field, str_field, alias_int, alias_date, alias_str' \ + ' FROM $db.%s ORDER BY alias_date' % ModelWithAliasFields.table_name() + for model_cls in (ModelWithAliasFields, None): + results = list(self.database.select(query, model_cls)) + self.assertEquals(len(results), 1) + self.assertEquals(results[0].date_field, instance.date_field) + self.assertEquals(results[0].int_field, instance.int_field) + self.assertEquals(results[0].str_field, instance.str_field) + self.assertEquals(results[0].alias_int, instance.int_field) + self.assertEquals(results[0].alias_str, instance.str_field) + self.assertEquals(results[0].alias_date, instance.date_field) + + def test_assignment_error(self): + # I can't prevent assigning at all, in case db.select statements with model provided sets model fields. 
+ instance = ModelWithAliasFields() + for value in ('x', [date.today()], ['aaa'], [None]): + with self.assertRaises(ValueError): + instance.alias_date = value + + +class ModelWithAliasFields(Model): + int_field = Int32Field() + date_field = DateField() + str_field = StringField() + + alias_str = AliasField(StringField(), 'str_field') + alias_int = MaterializedField(Int32Field(), 'int_field') + alias_date = MaterializedField(DateField(), 'date_field') + + engine = MergeTree('date_field', ('date_field',)) + diff --git a/tests/test_materialized_fields.py b/tests/test_materialized_fields.py new file mode 100644 index 0000000..855e5fd --- /dev/null +++ b/tests/test_materialized_fields.py @@ -0,0 +1,57 @@ +import unittest +from datetime import date + +from infi.clickhouse_orm.database import Database +from infi.clickhouse_orm.models import Model +from infi.clickhouse_orm.fields import * +from infi.clickhouse_orm.engines import * + + +class MaterializedFieldsTest(unittest.TestCase): + + def setUp(self): + self.database = Database('test-db') + self.database.create_table(ModelWithMaterializedFields) + + def tearDown(self): + self.database.drop_database() + + def test_insert_and_select(self): + instance = ModelWithMaterializedFields( + date_time_field='2016-08-30 11:00:00', + int_field=-10, + str_field='TEST' + ) + self.database.insert([instance]) + # We can't select * from table, as it doesn't select materialized and alias fields + query = 'SELECT date_time_field, int_field, str_field, mat_int, mat_date, mat_str' \ + ' FROM $db.%s ORDER BY mat_date' % ModelWithMaterializedFields.table_name() + for model_cls in (ModelWithMaterializedFields, None): + results = list(self.database.select(query, model_cls)) + self.assertEquals(len(results), 1) + self.assertEquals(results[0].date_time_field, instance.date_time_field) + self.assertEquals(results[0].int_field, instance.int_field) + self.assertEquals(results[0].str_field, instance.str_field) + self.assertEquals(results[0].mat_int, 
abs(instance.int_field)) + self.assertEquals(results[0].mat_str, instance.str_field.lower()) + self.assertEquals(results[0].mat_date, instance.date_time_field.date()) + + def test_assignment_error(self): + # I can't prevent assigning at all, in case db.select statements with model provided sets model fields. + instance = ModelWithMaterializedFields() + for value in ('x', [date.today()], ['aaa'], [None]): + with self.assertRaises(ValueError): + instance.mat_date = value + + +class ModelWithMaterializedFields(Model): + int_field = Int32Field() + date_time_field = DateTimeField() + str_field = StringField() + + mat_str = MaterializedField(StringField(), 'lower(str_field)') + mat_int = MaterializedField(Int32Field(), 'abs(int_field)') + mat_date = MaterializedField(DateField(), 'toDate(date_time_field)') + + engine = MergeTree('mat_date', ('mat_date',)) + diff --git a/tests/test_migrations.py b/tests/test_migrations.py index 39bcb55..0ccf425 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -60,6 +60,15 @@ class MigrationsTestCase(unittest.TestCase): self.assertTrue(self.tableExists(EnumModel1)) self.assertEquals(self.getTableFields(EnumModel2), [('date', 'Date'), ('f1', "Enum16('dog' = 1, 'cat' = 2, 'horse' = 3, 'pig' = 4)")]) + self.database.migrate('tests.sample_migrations', 8) + self.assertTrue(self.tableExists(MaterializedModel)) + self.assertEquals(self.getTableFields(MaterializedModel), + [('date_time', "DateTime"), ('date', 'Date')]) + self.database.migrate('tests.sample_migrations', 9) + self.assertTrue(self.tableExists(AliasModel)) + self.assertEquals(self.getTableFields(AliasModel), + [('date', 'Date'), ('date_alias', "Date")]) + # Several different models with the same table name, to simulate a table that changes over time @@ -127,3 +136,25 @@ class EnumModel2(Model): @classmethod def table_name(cls): return 'enum_mig' + + +class MaterializedModel(Model): + date_time = DateTimeField() + date = MaterializedField(DateField(), 
'toDate(date_time)') + + engine = MergeTree('date', ('date',)) + + @classmethod + def table_name(cls): + return 'materalized_date' + + +class AliasModel(Model): + date = DateField() + date_alias = AliasField(DateField(), 'date') + + engine = MergeTree('date', ('date',)) + + @classmethod + def table_name(cls): + return 'alias_date' \ No newline at end of file From 41e73a5cbb112325a6c504193f7d56c947e044be Mon Sep 17 00:00:00 2001 From: M1ha Date: Thu, 26 Jan 2017 15:48:01 +0500 Subject: [PATCH 03/11] Forgot readme --- README.rst | 76 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 56 insertions(+), 20 deletions(-) diff --git a/README.rst b/README.rst index 8e8c263..7d3e9d7 100644 --- a/README.rst +++ b/README.rst @@ -189,26 +189,28 @@ Field Types Currently the following field types are supported: -============= ======== ================= =================================================== -Class DB Type Pythonic Type Comments -============= ======== ================= =================================================== -StringField String unicode Encoded as UTF-8 when written to ClickHouse -DateField Date datetime.date Range 1970-01-01 to 2038-01-19 -DateTimeField DateTime datetime.datetime Minimal value is 1970-01-01 00:00:00; Always in UTC -Int8Field Int8 int Range -128 to 127 -Int16Field Int16 int Range -32768 to 32767 -Int32Field Int32 int Range -2147483648 to 2147483647 -Int64Field Int64 int/long Range -9223372036854775808 to 9223372036854775807 -UInt8Field UInt8 int Range 0 to 255 -UInt16Field UInt16 int Range 0 to 65535 -UInt32Field UInt32 int Range 0 to 4294967295 -UInt64Field UInt64 int/long Range 0 to 18446744073709551615 -Float32Field Float32 float -Float64Field Float64 float -Enum8Field Enum8 Enum See below -Enum16Field Enum16 Enum See below -ArrayField Array list See below -============= ======== ================= =================================================== +=================== ======== ================= 
=================================================== +Class DB Type Pythonic Type Comments +=================== ======== ================= =================================================== +StringField String unicode Encoded as UTF-8 when written to ClickHouse +DateField Date datetime.date Range 1970-01-01 to 2038-01-19 +DateTimeField DateTime datetime.datetime Minimal value is 1970-01-01 00:00:00; Always in UTC +Int8Field Int8 int Range -128 to 127 +Int16Field Int16 int Range -32768 to 32767 +Int32Field Int32 int Range -2147483648 to 2147483647 +Int64Field Int64 int/long Range -9223372036854775808 to 9223372036854775807 +UInt8Field UInt8 int Range 0 to 255 +UInt16Field UInt16 int Range 0 to 65535 +UInt32Field UInt32 int Range 0 to 4294967295 +UInt64Field UInt64 int/long Range 0 to 18446744073709551615 +Float32Field Float32 float +Float64Field Float64 float +Enum8Field Enum8 Enum See below +Enum16Field Enum16 Enum See below +ArrayField Array list See below +AliasField See below See below See below +MaterializedField See below See below See below +=================== ========== ================= =================================================== Working with enum fields ************************ @@ -249,6 +251,40 @@ You can create array fields containing any data type, for example:: data = SensorData(date=date.today(), temperatures=[25.5, 31.2, 28.7], humidity_levels=[41, 39, 66]) + +Working with materialized and alias fields +****************************************** + +ClickHouse provides an opportunity to create MATERIALIZED and ALIAS Fields. + +See documentation `here <https://clickhouse.yandex/reference_en.html#Default values>`_. + +Both field types can't be inserted into database directly. +These field values are ignored, when using database.insert() method. +These fields are set to default values if you use database.select('SELECT * FROM mymodel', model_class=MyModel), +because ClickHouse doesn't return them. +Nevertheless, attribute values (as well as defaults) can be set for model object from python.
+ +Usage:: + + class Event(models.Model): + + created = fields.DateTimeField() + created_date = fields.MaterializedField(fields.DateTimeField(), 'toDate(created)') + name = StringField() + username = AliasField(StringField(), 'name') + + engine = engines.MergeTree('created_date', ('created_date', 'created')) + + obj = Event(created=datetime.now(), name='MyEvent') + db = Database('my_test_db') + db.insert([obj]) + # All values will be retrieved from database + db.select('SELECT created, created_date, username, name FROM $db.event', model_class=Event) + # created_date, username will contain default value + db.select('SELECT * FROM $db.event', model_class=Event) + + Table Engines ------------- From 2509b5b2e33aa52e3df21f0ee067f85a915f49cd Mon Sep 17 00:00:00 2001 From: M1ha Date: Fri, 27 Jan 2017 10:42:37 +0500 Subject: [PATCH 04/11] Rewritten Alias and Materialized fields to field parameters like default. --- README.rst | 10 +-- src/infi/clickhouse_orm/fields.py | 101 +++++++----------------------- src/infi/clickhouse_orm/models.py | 2 +- tests/test_alias_fields.py | 18 +++++- tests/test_materialized_fields.py | 18 +++++- tests/test_migrations.py | 4 +- 6 files changed, 59 insertions(+), 94 deletions(-) diff --git a/README.rst b/README.rst index 7d3e9d7..70ab8af 100644 --- a/README.rst +++ b/README.rst @@ -31,6 +31,8 @@ Models are defined in a way reminiscent of Django's ORM:: engine = engines.MergeTree('birthday', ('first_name', 'last_name', 'birthday')) It is possible to provide a default value for a field, instead of its "natural" default (empty string for string fields, zero for numeric fields etc.). +It is also possible to pass alias or materialized parameters. See below for usage examples. +Only one of default, alias and materialized parameters can be provided. See below for the supported field types and table engines.
@@ -208,8 +210,6 @@ Float64Field Float64 float Enum8Field Enum8 Enum See below Enum16Field Enum16 Enum See below ArrayField Array list See below -AliasField See below See below See below -MaterializedField See below See below See below =================== ========== ================= =================================================== Working with enum fields @@ -270,9 +270,9 @@ Usage:: class Event(models.Model): created = fields.DateTimeField() - created_date = fields.MaterializedField(fields.DateTimeField(), 'toDate(created)') - name = StringField() - username = AliasField(StringField(), 'name') + created_date = fields.DateTimeField(materialized='toDate(created)') + name = fields.StringField() + username = fields.StringField(alias='name') engine = engines.MergeTree('created_date', ('created_date', 'created')) diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index 947c56a..51b67de 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -12,13 +12,17 @@ class Field(object): class_default = 0 db_type = None - # This flag indicates, if we should take this field value when inserting data - insertable = True + def __init__(self, default=None, alias=None, materialized=None): + assert (None, None) in {(default, alias), (alias, materialized), (default, materialized)}, \ + "Only one of default, alias and materialized parameters can be given" + assert alias is None or isinstance(alias, str), "Alias field must be string field name, if given" + assert materialized is None or isinstance(materialized, str), "Materialized field must be string, if given" - def __init__(self, default=None): self.creation_counter = Field.creation_counter Field.creation_counter += 1 self.default = self.class_default if default is None else default + self.alias = alias + self.materialized = materialized def to_python(self, value): ''' @@ -51,13 +55,22 @@ class Field(object): def get_sql(self, with_default=True): ''' Returns an SQL 
expression describing the field (e.g. for CREATE TABLE). + :param with_default: If True, adds default value to sql. + It doesn't affect fields with alias and materialized values. ''' - if with_default: + if self.alias: + return '%s ALIAS %s' % (self.db_type, self.alias) + elif self.materialized: + return '%s MATERIALIZED %s' % (self.db_type, self.materialized) + elif with_default: default = self.to_db_string(self.default) return '%s DEFAULT %s' % (self.db_type, default) else: return self.db_type + def is_insertable(self): + return self.alias is None and self.materialized is None + class StringField(Field): @@ -210,11 +223,11 @@ class Float64Field(BaseFloatField): class BaseEnumField(Field): - def __init__(self, enum_cls, default=None): + def __init__(self, enum_cls, default=None, alias=None, materialized=None): self.enum_cls = enum_cls if default is None: default = list(enum_cls)[0] - super(BaseEnumField, self).__init__(default) + super(BaseEnumField, self).__init__(default, alias, materialized) def to_python(self, value): if isinstance(value, self.enum_cls): @@ -274,9 +287,9 @@ class ArrayField(Field): class_default = [] - def __init__(self, inner_field, default=None): + def __init__(self, inner_field, default=None, alias=None, materialized=None): self.inner_field = inner_field - super(ArrayField, self).__init__(default) + super(ArrayField, self).__init__(default, alias, materialized) def to_python(self, value): if isinstance(value, text_type): @@ -299,75 +312,3 @@ class ArrayField(Field): from .utils import escape return 'Array(%s)' % self.inner_field.get_sql(with_default=False) - -class RelativeField(Field): - insertable = False - - def __init__(self, inner_field): - """ - Creates MATERIALIZED or ALIAS field - :param inner_field: Field subclass this field is acting like - """ - assert isinstance(inner_field, Field), "field must be Field subclass" - self.class_default = inner_field.class_default - self.default = inner_field.default - super(RelativeField, 
self).__init__() - self.inner_field = inner_field - - def to_python(self, value): - return self.inner_field.to_python(value) - - def validate(self, value): - return self.inner_field.validate(value) - - def to_db_string(self, value, quote=True): - return self.inner_field.to_db_string(value, quote=quote) - - -class MaterializedField(RelativeField): - """ - Creates ClickHouse MATERIALIZED field. It doesn't contain real data in database, it is counted on the spot - https://clickhouse.yandex/reference_en.html#Default values - """ - - def __init__(self, inner_field, code): - """ - Creates MATERIALIZED field - :param inner_field: Field subclass this field is acting like - :param code: ClickHouse code to execute when materialized field is called. See ClickHouse docs. - """ - super(MaterializedField, self).__init__(inner_field) - - self._code = code - - def get_sql(self, with_default=True): - """ - Generates SQL for create table command - :param with_default: This flag is inherited from Field model. Does nothing (MATERIALIZED have no default) - :return: Creation SQL string - """ - return '%s MATERIALIZED %s' % (self.inner_field.db_type, self._code) - - -class AliasField(RelativeField): - """ - Creates ClickHouse ALIAS field. It doesn't contain real data in database, only copies other one - https://clickhouse.yandex/reference_en.html#Default values - """ - - def __init__(self, inner_field, base_field_name): - """ - Creates ALIAS field - :param inner_field: Field instance this field is acting like - :param base_field_name: Name of field, to which alias is built - """ - super(AliasField, self).__init__(inner_field) - self.base_field_name = base_field_name - - def get_sql(self, with_default=True): - """ - Generates SQL for create table command - :param with_default: This flag is inherited from Field model. 
Does nothing (ALIAS have no default) - :return: Creation SQL string - """ - return '%s ALIAS %s' % (self.inner_field.db_type, self.base_field_name) diff --git a/src/infi/clickhouse_orm/models.py b/src/infi/clickhouse_orm/models.py index 2e7f836..16f6f77 100644 --- a/src/infi/clickhouse_orm/models.py +++ b/src/infi/clickhouse_orm/models.py @@ -157,7 +157,7 @@ class Model(with_metaclass(ModelBase)): ''' data = self.__dict__ - fields = [f for f in self._fields if f[1].insertable] if insertable_only else self._fields + fields = [f for f in self._fields if f[1].is_insertable()] if insertable_only else self._fields return '\t'.join(field.to_db_string(data[name], quote=False) for name, field in fields) diff --git a/tests/test_alias_fields.py b/tests/test_alias_fields.py index 87a54e6..af7bbc8 100644 --- a/tests/test_alias_fields.py +++ b/tests/test_alias_fields.py @@ -43,15 +43,27 @@ class MaterializedFieldsTest(unittest.TestCase): with self.assertRaises(ValueError): instance.alias_date = value + def test_wrong_field(self): + with self.assertRaises(AssertionError): + StringField(alias=123) + + def test_duplicate_default(self): + with self.assertRaises(AssertionError): + StringField(alias='str_field', default='with default') + + with self.assertRaises(AssertionError): + StringField(alias='str_field', materialized='str_field') + class ModelWithAliasFields(Model): int_field = Int32Field() date_field = DateField() str_field = StringField() - alias_str = AliasField(StringField(), 'str_field') - alias_int = MaterializedField(Int32Field(), 'int_field') - alias_date = MaterializedField(DateField(), 'date_field') + alias_str = StringField(alias='str_field') + alias_int = Int32Field(alias='int_field') + alias_date = DateField(alias='date_field') engine = MergeTree('date_field', ('date_field',)) + diff --git a/tests/test_materialized_fields.py b/tests/test_materialized_fields.py index 855e5fd..3151dc3 100644 --- a/tests/test_materialized_fields.py +++ 
b/tests/test_materialized_fields.py @@ -43,15 +43,27 @@ class MaterializedFieldsTest(unittest.TestCase): with self.assertRaises(ValueError): instance.mat_date = value + def test_wrong_field(self): + with self.assertRaises(AssertionError): + StringField(materialized=123) + + def test_duplicate_default(self): + with self.assertRaises(AssertionError): + StringField(materialized='str_field', default='with default') + + with self.assertRaises(AssertionError): + StringField(materialized='str_field', alias='str_field') + class ModelWithMaterializedFields(Model): int_field = Int32Field() date_time_field = DateTimeField() str_field = StringField() - mat_str = MaterializedField(StringField(), 'lower(str_field)') - mat_int = MaterializedField(Int32Field(), 'abs(int_field)') - mat_date = MaterializedField(DateField(), 'toDate(date_time_field)') + mat_str = StringField(materialized='lower(str_field)') + mat_int = Int32Field(materialized='abs(int_field)') + mat_date = DateField(materialized='toDate(date_time_field)') engine = MergeTree('mat_date', ('mat_date',)) + diff --git a/tests/test_migrations.py b/tests/test_migrations.py index 0ccf425..4541a6b 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -140,7 +140,7 @@ class EnumModel2(Model): class MaterializedModel(Model): date_time = DateTimeField() - date = MaterializedField(DateField(), 'toDate(date_time)') + date = DateField(materialized='toDate(date_time)') engine = MergeTree('date', ('date',)) @@ -151,7 +151,7 @@ class MaterializedModel(Model): class AliasModel(Model): date = DateField() - date_alias = AliasField(DateField(), 'date') + date_alias = DateField(alias='date') engine = MergeTree('date', ('date',)) From 6c4640bb24f71e0109c7d0c808afc1fa56cd2d62 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Tue, 7 Feb 2017 12:20:47 +0200 Subject: [PATCH 05/11] TRIVIAL code style --- src/infi/clickhouse_orm/models.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/src/infi/clickhouse_orm/models.py b/src/infi/clickhouse_orm/models.py index 16f6f77..a5c16df 100644 --- a/src/infi/clickhouse_orm/models.py +++ b/src/infi/clickhouse_orm/models.py @@ -156,8 +156,9 @@ class Model(with_metaclass(ModelBase)): :param bool insertable_only: If True, returns only fields, that can be inserted into database ''' data = self.__dict__ - - fields = [f for f in self._fields if f[1].is_insertable()] if insertable_only else self._fields + fields = self._fields + if insertable_only: + fields = [f for f in fields if f[1].is_insertable()] return '\t'.join(field.to_db_string(data[name], quote=False) for name, field in fields) From a73a69ef523037c3777ecb777a7af5e3332dc67a Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Tue, 7 Feb 2017 12:21:16 +0200 Subject: [PATCH 06/11] TRIVIAL add note about coverage --- README.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 70ab8af..432519c 100644 --- a/README.rst +++ b/README.rst @@ -327,4 +327,8 @@ After cloning the project, run the following commands:: To run the tests, ensure that the ClickHouse server is running on http://localhost:8123/ (this is the default), and run:: - bin/nosetests \ No newline at end of file + bin/nosetests + +To see test coverage information run:: + + bin/nosetests --with-coverage --cover-package=infi.clickhouse_orm From f29d737f29c0643216ca2a5f84ce0ab1c359812e Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Tue, 7 Feb 2017 15:25:16 +0200 Subject: [PATCH 07/11] Always keep datetime fields in UTC internally, and convert server timezone to UTC when parsing query results. 
--- src/infi/clickhouse_orm/database.py | 16 +++++++-- src/infi/clickhouse_orm/fields.py | 35 ++++++++++--------- src/infi/clickhouse_orm/models.py | 8 +++-- tests/test_simple_fields.py | 53 +++++++++++++++++++++++++++++ 4 files changed, 91 insertions(+), 21 deletions(-) create mode 100644 tests/test_simple_fields.py diff --git a/src/infi/clickhouse_orm/database.py b/src/infi/clickhouse_orm/database.py index 73bbc13..7b4b398 100644 --- a/src/infi/clickhouse_orm/database.py +++ b/src/infi/clickhouse_orm/database.py @@ -4,9 +4,12 @@ from .models import ModelBase from .utils import escape, parse_tsv, import_submodules from math import ceil import datetime -import logging from string import Template from six import PY3, string_types +import pytz + +import logging +logger = logging.getLogger('clickhouse_orm') Page = namedtuple('Page', 'objects number_of_objects pages_total number page_size') @@ -26,6 +29,7 @@ class Database(object): self.readonly = readonly if not self.readonly: self.create_database() + self.server_timezone = self._get_server_timezone() def create_database(self): self._send('CREATE DATABASE IF NOT EXISTS `%s`' % self.db_name) @@ -82,7 +86,7 @@ class Database(object): field_types = parse_tsv(next(lines)) model_class = model_class or ModelBase.create_ad_hoc_model(zip(field_names, field_types)) for line in lines: - yield model_class.from_tsv(line, field_names) + yield model_class.from_tsv(line, field_names, self.server_timezone) def paginate(self, model_class, order_by, page_num=1, page_size=100, conditions=None, settings=None): count = self.count(model_class, conditions) @@ -154,3 +158,11 @@ class Database(object): mapping['table'] = "`%s`.`%s`" % (self.db_name, model_class.table_name()) query = Template(query).substitute(mapping) return query + + def _get_server_timezone(self): + try: + r = self._send('SELECT timezone()') + return pytz.timezone(r.text.strip()) + except DatabaseException: + logger.exception('Cannot determine server timezone, assuming 
UTC') + return pytz.utc diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index 51b67de..e4115e8 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -2,6 +2,7 @@ from six import string_types, text_type, binary_type import datetime import pytz import time +from calendar import timegm from .utils import escape, parse_array @@ -24,10 +25,11 @@ class Field(object): self.alias = alias self.materialized = materialized - def to_python(self, value): + def to_python(self, value, timezone_in_use): ''' Converts the input value into the expected Python data type, raising ValueError if the data can't be converted. Returns the converted value. Subclasses should override this. + The timezone_in_use parameter should be consulted when parsing datetime fields. ''' return value @@ -77,7 +79,7 @@ class StringField(Field): class_default = '' db_type = 'String' - def to_python(self, value): + def to_python(self, value, timezone_in_use): if isinstance(value, text_type): return value if isinstance(value, binary_type): @@ -92,11 +94,11 @@ class DateField(Field): class_default = min_value db_type = 'Date' - def to_python(self, value): - if isinstance(value, datetime.date): - return value + def to_python(self, value, timezone_in_use): if isinstance(value, datetime.datetime): return value.date() + if isinstance(value, datetime.date): + return value if isinstance(value, int): return DateField.class_default + datetime.timedelta(days=value) if isinstance(value, string_types): @@ -117,26 +119,27 @@ class DateTimeField(Field): class_default = datetime.datetime.fromtimestamp(0, pytz.utc) db_type = 'DateTime' - def to_python(self, value): + def to_python(self, value, timezone_in_use): if isinstance(value, datetime.datetime): - return value + return value.astimezone(pytz.utc) if value.tzinfo else value.replace(tzinfo=pytz.utc) if isinstance(value, datetime.date): - return datetime.datetime(value.year, value.month, value.day) + 
return datetime.datetime(value.year, value.month, value.day, tzinfo=pytz.utc) if isinstance(value, int): - return datetime.datetime.fromtimestamp(value, pytz.utc) + return datetime.datetime.utcfromtimestamp(value).replace(tzinfo=pytz.utc) if isinstance(value, string_types): if value == '0000-00-00 00:00:00': return self.class_default - return datetime.datetime.strptime(value, '%Y-%m-%d %H:%M:%S') + dt = datetime.datetime.strptime(value, '%Y-%m-%d %H:%M:%S') + return timezone_in_use.localize(dt).astimezone(pytz.utc) raise ValueError('Invalid value for %s - %r' % (self.__class__.__name__, value)) def to_db_string(self, value, quote=True): - return escape(int(time.mktime(value.timetuple())), quote) + return escape(timegm(value.utctimetuple()), quote) class BaseIntField(Field): - def to_python(self, value): + def to_python(self, value, timezone_in_use): try: return int(value) except: @@ -204,7 +207,7 @@ class Int64Field(BaseIntField): class BaseFloatField(Field): - def to_python(self, value): + def to_python(self, value, timezone_in_use): try: return float(value) except: @@ -229,7 +232,7 @@ class BaseEnumField(Field): default = list(enum_cls)[0] super(BaseEnumField, self).__init__(default, alias, materialized) - def to_python(self, value): + def to_python(self, value, timezone_in_use): if isinstance(value, self.enum_cls): return value try: @@ -291,14 +294,14 @@ class ArrayField(Field): self.inner_field = inner_field super(ArrayField, self).__init__(default, alias, materialized) - def to_python(self, value): + def to_python(self, value, timezone_in_use): if isinstance(value, text_type): value = parse_array(value) elif isinstance(value, binary_type): value = parse_array(value.decode('UTF-8')) elif not isinstance(value, (list, tuple)): raise ValueError('ArrayField expects list or tuple, not %s' % type(value)) - return [self.inner_field.to_python(v) for v in value] + return [self.inner_field.to_python(v, timezone_in_use) for v in value] def validate(self, value): for v in 
value: diff --git a/src/infi/clickhouse_orm/models.py b/src/infi/clickhouse_orm/models.py index a5c16df..444e32e 100644 --- a/src/infi/clickhouse_orm/models.py +++ b/src/infi/clickhouse_orm/models.py @@ -3,6 +3,7 @@ from .engines import * from .fields import Field from six import with_metaclass +import pytz from logging import getLogger logger = getLogger('clickhouse_orm') @@ -96,7 +97,7 @@ class Model(with_metaclass(ModelBase)): ''' field = self.get_field(name) if field: - value = field.to_python(value) + value = field.to_python(value, pytz.utc) field.validate(value) super(Model, self).__setattr__(name, value) @@ -136,7 +137,7 @@ class Model(with_metaclass(ModelBase)): return 'DROP TABLE IF EXISTS `%s`.`%s`' % (db_name, cls.table_name()) @classmethod - def from_tsv(cls, line, field_names=None): + def from_tsv(cls, line, field_names=None, timezone_in_use=pytz.utc): ''' Create a model instance from a tab-separated line. The line may or may not include a newline. The field_names list must match the fields defined in the model, but does not have to include all of them. 
@@ -147,7 +148,8 @@ class Model(with_metaclass(ModelBase)): values = iter(parse_tsv(line)) kwargs = {} for name in field_names: - kwargs[name] = next(values) + field = getattr(cls, name) + kwargs[name] = field.to_python(next(values), timezone_in_use) return cls(**kwargs) def to_tsv(self, insertable_only=False): diff --git a/tests/test_simple_fields.py b/tests/test_simple_fields.py new file mode 100644 index 0000000..c955574 --- /dev/null +++ b/tests/test_simple_fields.py @@ -0,0 +1,53 @@ +import unittest +from infi.clickhouse_orm.fields import * +from datetime import date, datetime +import pytz + + +class SimpleFieldsTest(unittest.TestCase): + + def test_date_field(self): + f = DateField() + # Valid values + for value in (date(1970, 1, 1), datetime(1970, 1, 1), '1970-01-01', '0000-00-00', 0): + self.assertEquals(f.to_python(value, pytz.utc), date(1970, 1, 1)) + # Invalid values + for value in ('nope', '21/7/1999', 0.5): + with self.assertRaises(ValueError): + f.to_python(value, pytz.utc) + # Range check + for value in (date(1900, 1, 1), date(2900, 1, 1)): + with self.assertRaises(ValueError): + f.validate(value) + + def test_datetime_field(self): + f = DateTimeField() + epoch = datetime(1970, 1, 1, tzinfo=pytz.utc) + # Valid values + for value in (date(1970, 1, 1), datetime(1970, 1, 1), epoch, + epoch.astimezone(pytz.timezone('US/Eastern')), epoch.astimezone(pytz.timezone('Asia/Jerusalem')), + '1970-01-01 00:00:00', '0000-00-00 00:00:00', 0): + dt = f.to_python(value, pytz.utc) + self.assertEquals(dt.tzinfo, pytz.utc) + self.assertEquals(dt, epoch) + # Verify that conversion to and from db string does not change value + dt2 = f.to_python(int(f.to_db_string(dt)), pytz.utc) + self.assertEquals(dt, dt2) + # Invalid values + for value in ('nope', '21/7/1999', 0.5): + with self.assertRaises(ValueError): + f.to_python(value, pytz.utc) + + def test_uint8_field(self): + f = UInt8Field() + # Valid values + for value in (17, '17', 17.0): + 
self.assertEquals(f.to_python(value, pytz.utc), 17) + # Invalid values + for value in ('nope', date.today()): + with self.assertRaises(ValueError): + f.to_python(value, pytz.utc) + # Range check + for value in (-1, 1000): + with self.assertRaises(ValueError): + f.validate(value) \ No newline at end of file From f22073e2e6f7d26bfb40ed59f4e0a57699b07807 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Tue, 7 Feb 2017 16:22:04 +0200 Subject: [PATCH 08/11] Added change log --- CHANGELOG.rst | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 CHANGELOG.rst diff --git a/CHANGELOG.rst b/CHANGELOG.rst new file mode 100644 index 0000000..59d97c0 --- /dev/null +++ b/CHANGELOG.rst @@ -0,0 +1,27 @@ +Change Log +========== + +[Unreleased] +------------ +- Always keep datetime fields in UTC internally, and convert server timezone to UTC when parsing query results +- Support for ALIAS and MATERIALIZED fields (M1ha) +- Pagination: passing -1 as the page number now returns the last page +- Accept datetime values for date fields (Zloool) +- Support readonly mode in Database class (tswr) + +v0.7.1 +------ +- Accept '0000-00-00 00:00:00' as a datetime value (tsionyx) +- Bug fix: parse_array fails on int arrays +- Improve performance when inserting many rows + +v0.7.0 +------ +- Support array fields +- Support enum fields + +v0.6.3 +------ +- Python 3 support + + From 5b03e660486c30222583311dabbc5b170019b5b4 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Tue, 7 Feb 2017 16:36:41 +0200 Subject: [PATCH 09/11] Send readonly=1 when database is created in readonly mode --- src/infi/clickhouse_orm/database.py | 2 ++ tests/test_database.py | 14 +++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/infi/clickhouse_orm/database.py b/src/infi/clickhouse_orm/database.py index 7b4b398..43ca36e 100644 --- a/src/infi/clickhouse_orm/database.py +++ b/src/infi/clickhouse_orm/database.py @@ -146,6 +146,8 @@ class Database(object): 
params['user'] = self.username if self.password: params['password'] = self.password + if self.readonly: + params['readonly'] = '1' return params def _substitute(self, query, model_class=None): diff --git a/tests/test_database.py b/tests/test_database.py index 30c25d3..1e62472 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -2,7 +2,7 @@ import unittest -from infi.clickhouse_orm.database import Database +from infi.clickhouse_orm.database import Database, DatabaseException from infi.clickhouse_orm.models import Model from infi.clickhouse_orm.fields import * from infi.clickhouse_orm.engines import * @@ -117,6 +117,18 @@ class DatabaseTestCase(unittest.TestCase): p = list(self.database.select("SELECT * from $table", Person))[0] self.assertEquals(p.first_name, s) + def test_readonly(self): + orig_database = self.database + self.database = Database(orig_database.db_name, readonly=True) + with self.assertRaises(DatabaseException): + self._insert_and_check(self._sample_data(), len(data)) + self.assertEquals(self.database.count(Person), 0) + with self.assertRaises(DatabaseException): + self.database.drop_table(Person) + with self.assertRaises(DatabaseException): + self.database.drop_database() + self.database = orig_database + def _sample_data(self): for entry in data: yield Person(**entry) From b95046893be68424f4fc3aa21a5e088bdb534b67 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Tue, 7 Feb 2017 17:14:18 +0200 Subject: [PATCH 10/11] Add documentation about timezone handling --- README.rst | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/README.rst b/README.rst index 432519c..891fc3f 100644 --- a/README.rst +++ b/README.rst @@ -210,7 +210,25 @@ Float64Field Float64 float Enum8Field Enum8 Enum See below Enum16Field Enum16 Enum See below ArrayField Array list See below -=================== ========== ================= =================================================== +=================== ======== 
================= =================================================== + +DateTimeField and Time Zones +**************************** + +A ``DateTimeField`` can be assigned values from one of the following types: + +- datetime +- date +- integer - number of seconds since the Unix epoch +- string in ``YYYY-MM-DD HH:MM:SS`` format + +The assigned value always gets converted to a timezone-aware ``datetime`` in UTC. If the assigned +value is a timezone-aware ``datetime`` in another timezone, it will be converted to UTC. Otherwise, the assigned value is assumed to already be in UTC. + +DateTime values that are read from the database are also converted to UTC. ClickHouse formats them according to the +timezone of the server, and the ORM makes the necessary conversions. This requires a ClickHouse version which is new +enough to support the ``timezone()`` function, otherwise it is assumed to be using UTC. In any case, we recommend +setting the server timezone to UTC in order to prevent confusion. Working with enum fields ************************ @@ -255,15 +273,12 @@ You can create array fields containing any data type, for example:: Working with materialized and alias fields ****************************************** -ClickHouse provides an opportunity to create MATERIALIZED and ALIAS Fields. +ClickHouse provides an opportunity to create MATERIALIZED and ALIAS fields. +See documentation `here `_. -See documentation `here `. - -Both field types can't be inserted into database directly. -These field values are ignored, when using database.insert() method. -These fields are set to default values if you use database.select('SELECT * FROM mymodel', model_class=MyModel), -because ClickHouse doesn't return them. -Nevertheless, attribute values (as well as defaults) can be set for model object from python. +Both field types can't be inserted into the database directly, so they are ignored when using the ``Database.insert()`` method. 
+ClickHouse does not return the field values if you use ``"SELECT * FROM ..."`` - you have to list these field +names explicitly in the query. Usage:: @@ -281,7 +296,7 @@ Usage:: db.insert([obj]) # All values will be retrieved from database db.select('SELECT created, created_date, username, name FROM $db.event', model_class=Event) - # created_date, username will contain default value + # created_date and username will contain a default value db.select('SELECT * FROM $db.event', model_class=Event) From dec45a0436d9c0d8aba5ab381a9235e7ca752faf Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Tue, 7 Feb 2017 17:24:54 +0200 Subject: [PATCH 11/11] Document the Database.readonly option --- README.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 891fc3f..7dba0f9 100644 --- a/README.rst +++ b/README.rst @@ -31,8 +31,8 @@ Models are defined in a way reminiscent of Django's ORM:: engine = engines.MergeTree('birthday', ('first_name', 'last_name', 'birthday')) It is possible to provide a default value for a field, instead of its "natural" default (empty string for string fields, zero for numeric fields etc.). -It is always possible to pass alias or materialized parameters. See below for usage examples. -Only one of default, alias and materialized parameters can be provided +Alternatively it is possible to pass alias or materialized parameters (see below for usage examples). +Only one of ``default``, ``alias`` and ``materialized`` parameters can be provided. See below for the supported field types and table engines. @@ -92,6 +92,11 @@ Using the ``Database`` instance you can create a table for your model, and inser The ``insert`` method can take any iterable of model instances, but they all must belong to the same model class. +Creating a read-only database is also supported. 
Such a ``Database`` instance can only read data, and cannot +modify data or schemas:: + + db = Database('my_test_db', readonly=True) + Reading from the Database -------------------------