From 66f8e8a4aefd6efcbe53c06f5ed6d6983d7e6e2e Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Thu, 15 Sep 2016 11:32:07 +0300 Subject: [PATCH 1/3] Performance improvements when inserting escape - check first if there are any special chars before replacing send lines in batches Use list comprehension in to_tsv --- src/infi/clickhouse_orm/database.py | 18 +++++++++++++----- src/infi/clickhouse_orm/models.py | 9 ++------- src/infi/clickhouse_orm/utils.py | 8 ++++++-- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/src/infi/clickhouse_orm/database.py b/src/infi/clickhouse_orm/database.py index 23f115b..b8e4774 100644 --- a/src/infi/clickhouse_orm/database.py +++ b/src/infi/clickhouse_orm/database.py @@ -35,7 +35,7 @@ class Database(object): def drop_database(self): self._send('DROP DATABASE `%s`' % self.db_name) - def insert(self, model_instances): + def insert(self, model_instances, batch_size=1000): from six import next i = iter(model_instances) try: @@ -45,11 +45,19 @@ class Database(object): model_class = first_instance.__class__ def gen(): yield self._substitute('INSERT INTO $table FORMAT TabSeparated\n', model_class).encode('utf-8') - yield first_instance.to_tsv().encode('utf-8') - yield '\n'.encode('utf-8') + yield (first_instance.to_tsv() + '\n').encode('utf-8') + # Collect lines in batches of batch_size + batch = [] for instance in i: - yield instance.to_tsv().encode('utf-8') - yield '\n'.encode('utf-8') + batch.append(instance.to_tsv()) + if len(batch) >= batch_size: + # Return the current batch of lines + yield ('\n'.join(batch) + '\n').encode('utf-8') + # Start a new batch + batch = [] + # Return any remaining lines in partial batch + if batch: + yield ('\n'.join(batch) + '\n').encode('utf-8') self._send(gen()) def count(self, model_class, conditions=None): diff --git a/src/infi/clickhouse_orm/models.py b/src/infi/clickhouse_orm/models.py index a077366..6fae876 100644 --- a/src/infi/clickhouse_orm/models.py +++ b/src/infi/clickhouse_orm/models.py @@ -154,10 +154,5 @@ class Model(with_metaclass(ModelBase)): ''' Returns the instance's column values as a tab-separated line. A newline is not included. ''' - parts = [] - for name, field in self._fields: - value = field.to_db_string(getattr(self, name), quote=False) - parts.append(value) - tsv = '\t'.join(parts) - logger.debug(tsv) - return tsv + data = self.__dict__ + return '\t'.join(field.to_db_string(data[name], quote=False) for name, field in self._fields) diff --git a/src/infi/clickhouse_orm/utils.py b/src/infi/clickhouse_orm/utils.py index bcbaf90..e7d6521 100644 --- a/src/infi/clickhouse_orm/utils.py +++ b/src/infi/clickhouse_orm/utils.py @@ -14,6 +14,8 @@ SPECIAL_CHARS = { "'" : "\\'" } +SPECIAL_CHARS_REGEX = re.compile("[" + ''.join(SPECIAL_CHARS.values()) + "]") + def escape(value, quote=True): ''' @@ -22,8 +24,10 @@ def escape(value, quote=True): converts it to one. ''' if isinstance(value, string_types): - chars = (SPECIAL_CHARS.get(c, c) for c in value) - value = "'" + "".join(chars) + "'" if quote else "".join(chars) + if SPECIAL_CHARS_REGEX.search(value): + value = "".join(SPECIAL_CHARS.get(c, c) for c in value) + if quote: + value = "'" + value + "'" return text_type(value) From 685e3dffe96e648459a6f5a504388de9e929859b Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Thu, 15 Sep 2016 12:03:41 +0300 Subject: [PATCH 2/3] Bug fix - parse_array fails on int arrays --- src/infi/clickhouse_orm/utils.py | 4 ++-- tests/test_array_fields.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/infi/clickhouse_orm/utils.py b/src/infi/clickhouse_orm/utils.py index e7d6521..c24a93d 100644 --- a/src/infi/clickhouse_orm/utils.py +++ b/src/infi/clickhouse_orm/utils.py @@ -73,8 +73,8 @@ def parse_array(array_string): else: # Start of non-quoted value, find its end match = re.search(r",|\]", array_string) - values.append(array_string[1 : match.start() + 1]) - array_string = array_string[match.end():] + values.append(array_string[0 : match.start()]) + array_string = array_string[match.end() - 1:] def import_submodules(package_name): diff --git a/tests/test_array_fields.py b/tests/test_array_fields.py index ef3c3c1..ba4e94b 100644 --- a/tests/test_array_fields.py +++ b/tests/test_array_fields.py @@ -46,6 +46,21 @@ class ArrayFieldsTest(unittest.TestCase): with self.assertRaises(ValueError): instance.arr_int = value + def test_parse_array(self): + from infi.clickhouse_orm.utils import parse_array, unescape + self.assertEquals(parse_array("[]"), []) + self.assertEquals(parse_array("[1, 2, 395, -44]"), ["1", "2", "395", "-44"]) + self.assertEquals(parse_array("['big','mouse','','!']"), ["big", "mouse", "", "!"]) + self.assertEquals(parse_array(unescape("['\\r\\n\\0\\t\\b']")), ["\r\n\0\t\b"]) + for s in ("", + "[", + "]", + "[1, 2", + "3, 4]", + "['aaa', 'aaa]"): + with self.assertRaises(ValueError): + parse_array(s) + class ModelWithArrays(Model): From b2cf8f42451cb70c997b551678527f94b22947c0 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Thu, 29 Sep 2016 11:24:22 +0300 Subject: [PATCH 3/3] Fixes by tsionyx --- README.rst | 4 ++-- src/infi/clickhouse_orm/fields.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 27f2d07..f093d52 100644 --- a/README.rst +++ b/README.rst @@ -228,8 +228,8 @@ You can create array fields containing any data type, for example:: class SensorData(models.Model): date = fields.DateField() - temperatures = fields.ArrayField(fields.Float32Field) - humidity_levels = fields.ArrayField(fields.UInt8Field) + temperatures = fields.ArrayField(fields.Float32Field()) + humidity_levels = fields.ArrayField(fields.UInt8Field()) engine = engines.MergeTree('date', ('date',)) diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index 0406b01..fb4dab5 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -107,6 +107,8 @@ class DateTimeField(Field): if isinstance(value, int): return datetime.datetime.fromtimestamp(value, pytz.utc) if isinstance(value, string_types): + if value == '0000-00-00 00:00:00': + return self.class_default return datetime.datetime.strptime(value, '%Y-%m-%d %H:%M:%S') raise ValueError('Invalid value for %s - %r' % (self.__class__.__name__, value))