From 66f8e8a4aefd6efcbe53c06f5ed6d6983d7e6e2e Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Thu, 15 Sep 2016 11:32:07 +0300 Subject: [PATCH] Performance improvements when inserting: escape - check first if there are any special chars before replacing; send lines in batches; use list comprehension in to_tsv --- src/infi/clickhouse_orm/database.py | 18 +++++++++++++----- src/infi/clickhouse_orm/models.py | 9 ++------- src/infi/clickhouse_orm/utils.py | 8 ++++++-- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/src/infi/clickhouse_orm/database.py b/src/infi/clickhouse_orm/database.py index 23f115b..b8e4774 100644 --- a/src/infi/clickhouse_orm/database.py +++ b/src/infi/clickhouse_orm/database.py @@ -35,7 +35,7 @@ class Database(object): def drop_database(self): self._send('DROP DATABASE `%s`' % self.db_name) - def insert(self, model_instances): + def insert(self, model_instances, batch_size=1000): from six import next i = iter(model_instances) try: @@ -45,11 +45,19 @@ class Database(object): model_class = first_instance.__class__ def gen(): yield self._substitute('INSERT INTO $table FORMAT TabSeparated\n', model_class).encode('utf-8') - yield first_instance.to_tsv().encode('utf-8') - yield '\n'.encode('utf-8') + yield (first_instance.to_tsv() + '\n').encode('utf-8') + # Collect lines in batches of batch_size + batch = [] for instance in i: - yield instance.to_tsv().encode('utf-8') - yield '\n'.encode('utf-8') + batch.append(instance.to_tsv()) + if len(batch) >= batch_size: + # Return the current batch of lines + yield ('\n'.join(batch) + '\n').encode('utf-8') + # Start a new batch + batch = [] + # Return any remaining lines in partial batch + if batch: + yield ('\n'.join(batch) + '\n').encode('utf-8') self._send(gen()) def count(self, model_class, conditions=None): diff --git a/src/infi/clickhouse_orm/models.py b/src/infi/clickhouse_orm/models.py index a077366..6fae876 100644 --- a/src/infi/clickhouse_orm/models.py +++ 
b/src/infi/clickhouse_orm/models.py @@ -154,10 +154,5 @@ class Model(with_metaclass(ModelBase)): ''' Returns the instance's column values as a tab-separated line. A newline is not included. ''' - parts = [] - for name, field in self._fields: - value = field.to_db_string(getattr(self, name), quote=False) - parts.append(value) - tsv = '\t'.join(parts) - logger.debug(tsv) - return tsv + data = self.__dict__ + return '\t'.join(field.to_db_string(data[name], quote=False) for name, field in self._fields) diff --git a/src/infi/clickhouse_orm/utils.py b/src/infi/clickhouse_orm/utils.py index bcbaf90..e7d6521 100644 --- a/src/infi/clickhouse_orm/utils.py +++ b/src/infi/clickhouse_orm/utils.py @@ -14,6 +14,8 @@ SPECIAL_CHARS = { "'" : "\\'" } +SPECIAL_CHARS_REGEX = re.compile("[" + ''.join(SPECIAL_CHARS.values()) + "]") + def escape(value, quote=True): ''' @@ -22,8 +24,10 @@ def escape(value, quote=True): converts it to one. ''' if isinstance(value, string_types): - chars = (SPECIAL_CHARS.get(c, c) for c in value) - value = "'" + "".join(chars) + "'" if quote else "".join(chars) + if SPECIAL_CHARS_REGEX.search(value): + value = "".join(SPECIAL_CHARS.get(c, c) for c in value) + if quote: + value = "'" + value + "'" return text_type(value)