Reduce memory footprint of Database.insert()

2025-07-13 01:22:27 +03:00 · 2017-04-05 17:42:42 +03:00 · 2017-04-05 17:42:42 +03:00 · 21907966b2
commit 21907966b2
parent ec99044fab
2 changed files with 16 additions and 9 deletions
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,6 +7,7 @@ Unreleased
 - Fix problem with SELECT WITH TOTALS (pilosus)
 - Update serialization format of DateTimeField to 10 digits, zero padded (nikepan)
 - Greatly improve performance when inserting large strings (credit to M1hacka for identifying the problem)
+- Reduce memory footprint of Database.insert()

 v0.8.0
 ------
--- a/src/infi/clickhouse_orm/database.py
+++ b/src/infi/clickhouse_orm/database.py
@@ -50,6 +50,7 @@ class Database(object):

    def insert(self, model_instances, batch_size=1000):
        from six import next
+        from cStringIO import StringIO
        i = iter(model_instances)
        try:
            first_instance = next(i)
@@ -61,22 +62,27 @@ class Database(object):
            raise DatabaseException("You can't insert into read only table")

        def gen():
-            yield self._substitute('INSERT INTO $table FORMAT TabSeparated\n', model_class).encode('utf-8')
+            buf = StringIO()
+            buf.write(self._substitute('INSERT INTO $table FORMAT TabSeparated\n', model_class).encode('utf-8'))
            first_instance.set_database(self)
-            yield (first_instance.to_tsv(include_readonly=False) + '\n').encode('utf-8')
+            buf.write(first_instance.to_tsv(include_readonly=False).encode('utf-8'))
+            buf.write('\n')
            # Collect lines in batches of batch_size
-            batch = []
+            lines = 2
            for instance in i:
                instance.set_database(self)
-                batch.append(instance.to_tsv(include_readonly=False))
-                if len(batch) >= batch_size:
+                buf.write(instance.to_tsv(include_readonly=False).encode('utf-8'))
+                buf.write('\n')
+                lines += 1
+                if lines >= batch_size:
                    # Return the current batch of lines
-                    yield ('\n'.join(batch) + '\n').encode('utf-8')
+                    yield buf.getvalue()
                    # Start a new batch
-                    batch = []
+                    buf = StringIO()
+                    lines = 0
            # Return any remaining lines in partial batch
-            if batch:
-                yield ('\n'.join(batch) + '\n').encode('utf-8')
+            if lines:
+                yield buf.getvalue()
        self._send(gen())

    def count(self, model_class, conditions=None):