Release v0.8.1: faster escaping of large strings and lower memory use in
Database.insert().

Review notes:
  * Database.insert() accumulates each batch in an in-memory byte buffer
    instead of a list of lines joined at the end.
  * The buffer is io.BytesIO and the row separator is a bytes literal:
    cStringIO exists only on Python 2, while this code relies on six,
    so importing it would break insert() on Python 3.
  * Int/float fields bypass escape(), and escape() does one regex
    substitution instead of a per-character join.

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 878d8a3..4579012 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,11 +1,13 @@
 Change Log
 ==========
 
-Unreleased
-----------
+v0.8.1
+------
 - Add support for ReplacingMergeTree (leenr)
 - Fix problem with SELECT WITH TOTALS (pilosus)
 - Update serialization format of DateTimeField to 10 digits, zero padded (nikepan)
+- Greatly improve performance when inserting large strings (credit to M1hacka for identifying the problem)
+- Reduce memory footprint of Database.insert()
 
 v0.8.0
 ------
diff --git a/src/infi/clickhouse_orm/database.py b/src/infi/clickhouse_orm/database.py
index 3afd1c3..7226f82 100644
--- a/src/infi/clickhouse_orm/database.py
+++ b/src/infi/clickhouse_orm/database.py
@@ -50,6 +50,7 @@ class Database(object):
 
     def insert(self, model_instances, batch_size=1000):
         from six import next
+        from io import BytesIO
         i = iter(model_instances)
         try:
             first_instance = next(i)
@@ -61,22 +62,27 @@ class Database(object):
             raise DatabaseException("You can't insert into read only table")
 
         def gen():
-            yield self._substitute('INSERT INTO $table FORMAT TabSeparated\n', model_class).encode('utf-8')
+            buf = BytesIO()
+            buf.write(self._substitute('INSERT INTO $table FORMAT TabSeparated\n', model_class).encode('utf-8'))
             first_instance.set_database(self)
-            yield (first_instance.to_tsv(include_readonly=False) + '\n').encode('utf-8')
+            buf.write(first_instance.to_tsv(include_readonly=False).encode('utf-8'))
+            buf.write(b'\n')
             # Collect lines in batches of batch_size
-            batch = []
+            lines = 2
             for instance in i:
                 instance.set_database(self)
-                batch.append(instance.to_tsv(include_readonly=False))
-                if len(batch) >= batch_size:
+                buf.write(instance.to_tsv(include_readonly=False).encode('utf-8'))
+                buf.write(b'\n')
+                lines += 1
+                if lines >= batch_size:
                     # Return the current batch of lines
-                    yield ('\n'.join(batch) + '\n').encode('utf-8')
+                    yield buf.getvalue()
                     # Start a new batch
-                    batch = []
+                    buf = BytesIO()
+                    lines = 0
             # Return any remaining lines in partial batch
-            if batch:
-                yield ('\n'.join(batch) + '\n').encode('utf-8')
+            if lines:
+                yield buf.getvalue()
         self._send(gen())
 
     def count(self, model_class, conditions=None):
diff --git a/src/infi/clickhouse_orm/fields.py b/src/infi/clickhouse_orm/fields.py
index 915ebee..f7d3992 100644
--- a/src/infi/clickhouse_orm/fields.py
+++ b/src/infi/clickhouse_orm/fields.py
@@ -154,6 +154,11 @@ class BaseIntField(Field):
         except:
             raise ValueError('Invalid value for %s - %r' % (self.__class__.__name__, value))
 
+    def to_db_string(self, value, quote=True):
+        # There's no need to call escape since numbers do not contain
+        # special characters, and never need quoting
+        return text_type(value)
+
     def validate(self, value):
         self._range_check(value, self.min_value, self.max_value)
 
@@ -222,6 +227,11 @@ class BaseFloatField(Field):
         except:
             raise ValueError('Invalid value for %s - %r' % (self.__class__.__name__, value))
 
+    def to_db_string(self, value, quote=True):
+        # There's no need to call escape since numbers do not contain
+        # special characters, and never need quoting
+        return text_type(value)
+
 
 class Float32Field(BaseFloatField):
 
diff --git a/src/infi/clickhouse_orm/models.py b/src/infi/clickhouse_orm/models.py
index a6163f7..44437a9 100644
--- a/src/infi/clickhouse_orm/models.py
+++ b/src/infi/clickhouse_orm/models.py
@@ -27,6 +27,7 @@ class ModelBase(type):
         fields = base_fields + [item for item in attrs.items() if isinstance(item[1], Field)]
         fields.sort(key=lambda item: item[1].creation_counter)
         setattr(new_cls, '_fields', fields)
+        setattr(new_cls, '_writable_fields', [f for f in fields if not f[1].readonly])
         return new_cls
 
     @classmethod
@@ -186,7 +187,7 @@ class Model(with_metaclass(ModelBase)):
         :param bool include_readonly: If False, returns only fields, that can be inserted into database
         '''
         data = self.__dict__
-        fields = self._fields if include_readonly else [f for f in self._fields if not f[1].readonly]
+        fields = self._fields if include_readonly else self._writable_fields
         return '\t'.join(field.to_db_string(data[name], quote=False) for name, field in fields)
 
     def to_dict(self, include_readonly=True, field_names=None):
@@ -195,7 +196,7 @@ class Model(with_metaclass(ModelBase)):
         :param bool include_readonly: If False, returns only fields, that can be inserted into database
         :param field_names: An iterable of field names to return
         '''
-        fields = self._fields if include_readonly else [f for f in self._fields if not f[1].readonly]
+        fields = self._fields if include_readonly else self._writable_fields
 
         if field_names is not None:
             fields = [f for f in fields if f[0] in field_names]
diff --git a/src/infi/clickhouse_orm/utils.py b/src/infi/clickhouse_orm/utils.py
index f5b5b22..83d11e0 100644
--- a/src/infi/clickhouse_orm/utils.py
+++ b/src/infi/clickhouse_orm/utils.py
@@ -17,15 +17,18 @@ SPECIAL_CHARS = {
 
 SPECIAL_CHARS_REGEX = re.compile("[" + ''.join(SPECIAL_CHARS.values()) + "]")
 
+
 def escape(value, quote=True):
     '''
     If the value is a string, escapes any special characters and optionally
     surrounds it with single quotes. If the value is not a string (e.g. a number),
     converts it to one.
     '''
+    def escape_one(match):
+        return SPECIAL_CHARS[match.group(0)]
+
     if isinstance(value, string_types):
-        if SPECIAL_CHARS_REGEX.search(value):
-            value = "".join(SPECIAL_CHARS.get(c, c) for c in value)
+        value = SPECIAL_CHARS_REGEX.sub(escape_one, value)
         if quote:
             value = "'" + value + "'"
     return text_type(value)