Skip empty rows in XLSX (#2028)

* Prevent empty lines in XLSX * Test for create_dataset empty rows * Update AUTHORS * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix * Add IMPORT_EXPORT_IMPORT_IGNORE_BLANK_LINES flag * Add docs * Update tests * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updated docs * updated changelog * Update changelog.rst * performance improvement --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: matthewhegarty <mrhegarty@gmail.com> Co-authored-by: Matt Hegarty <matthewhegarty@users.noreply.github.com>
2025-12-13 12:43:59 +03:00 · 2025-02-07 15:50:28 +01:00 · 2025-02-07 15:50:28 +01:00 · 8a6586090b
commit 8a6586090b
parent ffe94d16d4
6 changed files with 96 additions and 4 deletions
--- a/1
+++ b/1
@ -158,3 +158,4 @@ The following is a list of much appreciated contributors:
 * 19greg96 (Gergely Karz)
 * AyushDharDubey
 * dahvo (David Mark Awad)
+* jurrian
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -5,6 +5,11 @@ Changelog

    If upgrading from v3, v4 introduces breaking changes.  Please refer to :doc:`release notes<release_notes>`.

+4.3.6 (unreleased)
+------------------
+
+- Add flag to ignore empty rows in XLSX import (`2028 <https://github.com/django-import-export/django-import-export/issues/2028>`_)
+
 4.3.5 (2025-02-01)
 ------------------

--- a/docs/faq.rst
+++ b/docs/faq.rst
@ -184,12 +184,21 @@ How to create relation during import if it does not exist

 See :ref:`creating-non-existent-relations`.

-How to handle large file uploads
---------------------------------
+How to handle large file imports
+--------------------------------

 If uploading large files, you may encounter time-outs.
 See :ref:`Using celery<celery>` and :ref:`bulk_import:Bulk imports`.

+Performance issues or unexpected behavior during import
+-------------------------------------------------------
+
+This could be due to empty rows in Excel files, which are not always visible in the editor.
+Empty rows can be skipped during import by enabling :ref:`import_export_import_ignore_blank_lines`.
+
+Refer to `this PR <https://github.com/django-import-export/django-import-export/pull/2028>`_ for more information.
+
+
 How to use field other than `id` in Foreign Key lookup
 ------------------------------------------------------

--- a/docs/installation.rst
+++ b/docs/installation.rst
@ -254,6 +254,16 @@ The values must be those provided in ``import_export.formats.base_formats`` e.g

 This can be set for a specific model admin by declaring the ``export_formats`` attribute.

+.. _import_export_import_ignore_blank_lines:
+
+``IMPORT_EXPORT_IMPORT_IGNORE_BLANK_LINES``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If set to ``True``, rows without content will be ignored in XLSX imports.
+This works around an old Excel 1.0 bug which causes openpyxl ``max_row`` to count all
+logically empty rows. Some editors (like LibreOffice) might add :math:`2^{20}` empty rows to the
+file, which causes a significant slowdown. By default this is ``False``.
+
 .. _exampleapp:

 Example app
--- a/import_export/formats/base_formats.py
+++ b/import_export/formats/base_formats.py
@ -208,9 +208,18 @@ class XLSX(TablibFormat):
        rows = sheet.rows
        dataset.headers = [cell.value for cell in next(rows)]

+        ignore_blanks = getattr(
+            settings, "IMPORT_EXPORT_IMPORT_IGNORE_BLANK_LINES", False
+        )
        for row in rows:
            row_values = [cell.value for cell in row]
-            dataset.append(row_values)
+
+            if ignore_blanks:
+                # do not add empty rows to dataset
+                if not all(value is None for value in row_values):
+                    dataset.append(row_values)
+            else:
+                dataset.append(row_values)
        return dataset

    def export_data(self, dataset, **kwargs):
--- a/tests/core/tests/test_base_formats.py
+++ b/tests/core/tests/test_base_formats.py
@ -1,10 +1,12 @@
 import os
 import unittest
+from io import BytesIO
 from unittest import mock

+import openpyxl
 import tablib
 from core.tests.utils import ignore_utcnow_deprecation_warning
-from django.test import TestCase
+from django.test import TestCase, override_settings
 from django.utils.encoding import force_str
 from tablib.core import UnsupportedFormat

@ -115,6 +117,62 @@ class XLSXTest(TestCase):
            unittest.mock.ANY, read_only=True, data_only=True
        )

+    @override_settings(IMPORT_EXPORT_IMPORT_IGNORE_BLANK_LINES=False)
+    def test_xlsx_create_dataset__empty_rows(self):
+        """Default situation without the flag: do not ignore the empty rows for
+        backwards compatibility.
+        """
+        rows_before = 3
+        empty_rows = 5
+        rows_after = 2
+
+        wb = openpyxl.Workbook()
+        ws = wb.active
+        ws.append(["Header1", "Header2", "Header3"])
+
+        for _ in range(rows_before):
+            ws.append(["Data1", "Data2", "Data3"])
+
+        for _ in range(empty_rows):
+            ws.append([None, None, None])
+
+        for _ in range(rows_after):
+            ws.append(["Data1", "Data2", "Data3"])
+
+        xlsx_data = BytesIO()
+        wb.save(xlsx_data)
+        xlsx_data.seek(0)
+
+        dataset = self.format.create_dataset(xlsx_data.getvalue())
+        assert len(dataset) == rows_before + empty_rows + rows_after  # With empty rows
+
+    @override_settings(IMPORT_EXPORT_IMPORT_IGNORE_BLANK_LINES=True)
+    def test_xlsx_create_dataset__ignore_empty_rows(self):
+        """Ensure that empty rows are not added to the dataset."""
+        rows_before = 3
+        empty_rows = 5
+        rows_after = 2
+
+        wb = openpyxl.Workbook()
+        ws = wb.active
+        ws.append(["Header1", "Header2", "Header3"])
+
+        for _ in range(rows_before):
+            ws.append(["Data1", "Data2", "Data3"])
+
+        for _ in range(empty_rows):
+            ws.append([None, None, None])
+
+        for _ in range(rows_after):
+            ws.append(["Data1", "Data2", "Data3"])
+
+        xlsx_data = BytesIO()
+        wb.save(xlsx_data)
+        xlsx_data.seek(0)
+
+        dataset = self.format.create_dataset(xlsx_data.getvalue())
+        assert len(dataset) == rows_before + rows_after  # Without empty rows
+

 class CSVTest(TestCase):
    def setUp(self):