From ec99044fab128a3a208e9671ee9e6d0e4af146c0 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Wed, 5 Apr 2017 17:09:56 +0300 Subject: [PATCH 1/5] Greatly improve performance when inserting large strings (credit to M1hacka for identifying the problem) --- CHANGELOG.rst | 1 + src/infi/clickhouse_orm/utils.py | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 878d8a3..6484879 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,7 @@ Unreleased - Add support for ReplacingMergeTree (leenr) - Fix problem with SELECT WITH TOTALS (pilosus) - Update serialization format of DateTimeField to 10 digits, zero padded (nikepan) +- Greatly improve performance when inserting large strings (credit to M1hacka for identifying the problem) v0.8.0 ------ diff --git a/src/infi/clickhouse_orm/utils.py b/src/infi/clickhouse_orm/utils.py index f5b5b22..83d11e0 100644 --- a/src/infi/clickhouse_orm/utils.py +++ b/src/infi/clickhouse_orm/utils.py @@ -17,15 +17,18 @@ SPECIAL_CHARS = { SPECIAL_CHARS_REGEX = re.compile("[" + ''.join(SPECIAL_CHARS.values()) + "]") + def escape(value, quote=True): ''' If the value is a string, escapes any special characters and optionally surrounds it with single quotes. If the value is not a string (e.g. a number), converts it to one. ''' + def escape_one(match): + return SPECIAL_CHARS[match.group(0)] + if isinstance(value, string_types): - if SPECIAL_CHARS_REGEX.search(value): - value = "".join(SPECIAL_CHARS.get(c, c) for c in value) + value = SPECIAL_CHARS_REGEX.sub(escape_one, value) if quote: value = "'" + value + "'" return text_type(value) From 21907966b2a4217cab3f09aa4d0971d56495ddd9 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Wed, 5 Apr 2017 17:42:42 +0300 Subject: [PATCH 2/5] Reduce memory footprint of Database.insert() --- CHANGELOG.rst | 1 + src/infi/clickhouse_orm/database.py | 24 +++++++++++++++--------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 6484879..0bb54a3 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,7 @@ Unreleased - Fix problem with SELECT WITH TOTALS (pilosus) - Update serialization format of DateTimeField to 10 digits, zero padded (nikepan) - Greatly improve performance when inserting large strings (credit to M1hacka for identifying the problem) +- Reduce memory footprint of Database.insert() v0.8.0 ------ diff --git a/src/infi/clickhouse_orm/database.py b/src/infi/clickhouse_orm/database.py index 3afd1c3..7226f82 100644 --- a/src/infi/clickhouse_orm/database.py +++ b/src/infi/clickhouse_orm/database.py @@ -50,6 +50,7 @@ class Database(object): def insert(self, model_instances, batch_size=1000): from six import next + from cStringIO import StringIO i = iter(model_instances) try: first_instance = next(i) @@ -61,22 +62,27 @@ class Database(object): raise DatabaseException("You can't insert into read only table") def gen(): - yield self._substitute('INSERT INTO $table FORMAT TabSeparated\n', model_class).encode('utf-8') + buf = StringIO() + buf.write(self._substitute('INSERT INTO $table FORMAT TabSeparated\n', model_class).encode('utf-8')) first_instance.set_database(self) - yield (first_instance.to_tsv(include_readonly=False) + '\n').encode('utf-8') + buf.write(first_instance.to_tsv(include_readonly=False).encode('utf-8')) + buf.write('\n') # Collect lines in batches of batch_size - batch = [] + lines = 2 for instance in i: instance.set_database(self) - batch.append(instance.to_tsv(include_readonly=False)) - if len(batch) >= batch_size: + buf.write(instance.to_tsv(include_readonly=False).encode('utf-8')) + buf.write('\n') + lines += 1 + if lines >= batch_size: # Return the current batch of lines - yield ('\n'.join(batch) + '\n').encode('utf-8') + yield buf.getvalue() # Start a new batch - batch = [] + buf = StringIO() + lines = 0 # Return any remaining lines in partial batch - if batch: - yield ('\n'.join(batch) + '\n').encode('utf-8') + if lines: + yield buf.getvalue() self._send(gen()) def count(self, model_class, conditions=None): From c5a9b16eac7bf0ac07655d5b76690393e2000228 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Wed, 5 Apr 2017 18:19:12 +0300 Subject: [PATCH 3/5] Performance improvement: skip utils.escape for numeric fields --- src/infi/clickhouse_orm/fields.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py index 915ebee..f7d3992 100644 --- a/src/infi/clickhouse_orm/fields.py +++ b/src/infi/clickhouse_orm/fields.py @@ -154,6 +154,11 @@ class BaseIntField(Field): except: raise ValueError('Invalid value for %s - %r' % (self.__class__.__name__, value)) + def to_db_string(self, value, quote=True): + # There's no need to call escape since numbers do not contain + # special characters, and never need quoting + return text_type(value) + def validate(self, value): self._range_check(value, self.min_value, self.max_value) @@ -222,6 +227,11 @@ class BaseFloatField(Field): except: raise ValueError('Invalid value for %s - %r' % (self.__class__.__name__, value)) + def to_db_string(self, value, quote=True): + # There's no need to call escape since numbers do not contain + # special characters, and never need quoting + return text_type(value) + class Float32Field(BaseFloatField): From dacf88adbfa34f73d9cc780684875ccf91adb961 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Wed, 5 Apr 2017 18:20:01 +0300 Subject: [PATCH 4/5] Performance improvement: build a _writable_fields list once instead of calculating it every time it's needed --- src/infi/clickhouse_orm/models.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/infi/clickhouse_orm/models.py b/src/infi/clickhouse_orm/models.py index a6163f7..44437a9 100644 --- a/src/infi/clickhouse_orm/models.py +++ b/src/infi/clickhouse_orm/models.py @@ -27,6 +27,7 @@ class ModelBase(type): fields = base_fields + [item for item in attrs.items() if isinstance(item[1], Field)] fields.sort(key=lambda item: item[1].creation_counter) setattr(new_cls, '_fields', fields) + setattr(new_cls, '_writable_fields', [f for f in fields if not f[1].readonly]) return new_cls @classmethod @@ -186,7 +187,7 @@ class Model(with_metaclass(ModelBase)): :param bool include_readonly: If False, returns only fields, that can be inserted into database ''' data = self.__dict__ - fields = self._fields if include_readonly else [f for f in self._fields if not f[1].readonly] + fields = self._fields if include_readonly else self._writable_fields return '\t'.join(field.to_db_string(data[name], quote=False) for name, field in fields) def to_dict(self, include_readonly=True, field_names=None): @@ -195,7 +196,7 @@ class Model(with_metaclass(ModelBase)): :param bool include_readonly: If False, returns only fields, that can be inserted into database :param field_names: An iterable of field names to return ''' - fields = self._fields if include_readonly else [f for f in self._fields if not f[1].readonly] + fields = self._fields if include_readonly else self._writable_fields if field_names is not None: fields = [f for f in fields if f[0] in field_names] From 52f11319adb11ee7710e7742857fc6aebf3f8ed2 Mon Sep 17 00:00:00 2001 From: Itai Shirav Date: Wed, 5 Apr 2017 18:37:14 +0300 Subject: [PATCH 5/5] Releasing v0.8.1 --- CHANGELOG.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0bb54a3..4579012 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,8 +1,8 @@ Change Log ========== -Unreleased ----------- +v0.8.1 +------ - Add support for ReplacingMergeTree (leenr) - Fix problem with SELECT WITH TOTALS (pilosus) - Update serialization format of DateTimeField to 10 digits, zero padded (nikepan)