Skip empty rows in XLSX (#2028)

* Prevent empty lines in XLSX

* Test for create_dataset empty rows

* Update AUTHORS

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix

* Add IMPORT_EXPORT_IMPORT_IGNORE_BLANK_LINES flag

* Add docs

* Update tests

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* updated docs

* updated changelog

* Update changelog.rst

* performance improvement

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: matthewhegarty <mrhegarty@gmail.com>
Co-authored-by: Matt Hegarty <matthewhegarty@users.noreply.github.com>
This commit is contained in:
Jurrian Tromp 2025-02-07 15:50:28 +01:00 committed by GitHub
parent ffe94d16d4
commit 8a6586090b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 96 additions and 4 deletions

View File

@ -158,3 +158,4 @@ The following is a list of much appreciated contributors:
* 19greg96 (Gergely Karz)
* AyushDharDubey
* dahvo (David Mark Awad)
* jurrian

View File

@ -5,6 +5,11 @@ Changelog
If upgrading from v3, v4 introduces breaking changes. Please refer to :doc:`release notes<release_notes>`.
4.3.6 (unreleased)
------------------
- Add flag to ignore empty rows in XLSX import (`2028 <https://github.com/django-import-export/django-import-export/issues/2028>`_)
4.3.5 (2025-02-01)
------------------

View File

@ -184,12 +184,21 @@ How to create relation during import if it does not exist
See :ref:`creating-non-existent-relations`.
How to handle large file uploads
---------------------------------
How to handle large file imports
--------------------------------
If uploading large files, you may encounter time-outs.
See :ref:`Using celery<celery>` and :ref:`bulk_import:Bulk imports`.
Performance issues or unexpected behavior during import
-------------------------------------------------------
This could be due to empty rows in Excel files which are not visible in the editor.
Empty rows can be excluded using :ref:`import_export_import_ignore_blank_lines`.
Refer to `this PR <https://github.com/django-import-export/django-import-export/pull/2028>`_ for more information.
How to use field other than `id` in Foreign Key lookup
------------------------------------------------------

View File

@ -254,6 +254,16 @@ The values must be those provided in ``import_export.formats.base_formats`` e.g
This can be set for a specific model admin by declaring the ``export_formats`` attribute.
.. _import_export_import_ignore_blank_lines:
``IMPORT_EXPORT_IMPORT_IGNORE_BLANK_LINES``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If set to ``True``, rows without content will be ignored in XLSX imports.
This works around an old Excel 1.0 bug which causes openpyxl ``max_row`` to count all
logically empty rows. Some editors (like LibreOffice) might add :math:`2^{20}` empty rows to the
file, which causes a significant slowdown. By default this is ``False``.
.. _exampleapp:
Example app

View File

@ -208,9 +208,18 @@ class XLSX(TablibFormat):
rows = sheet.rows
dataset.headers = [cell.value for cell in next(rows)]
ignore_blanks = getattr(
settings, "IMPORT_EXPORT_IMPORT_IGNORE_BLANK_LINES", False
)
for row in rows:
row_values = [cell.value for cell in row]
dataset.append(row_values)
if ignore_blanks:
# do not add empty rows to dataset
if not all(value is None for value in row_values):
dataset.append(row_values)
else:
dataset.append(row_values)
return dataset
def export_data(self, dataset, **kwargs):

View File

@ -1,10 +1,12 @@
import os
import unittest
from io import BytesIO
from unittest import mock
import openpyxl
import tablib
from core.tests.utils import ignore_utcnow_deprecation_warning
from django.test import TestCase
from django.test import TestCase, override_settings
from django.utils.encoding import force_str
from tablib.core import UnsupportedFormat
@ -115,6 +117,62 @@ class XLSXTest(TestCase):
unittest.mock.ANY, read_only=True, data_only=True
)
@override_settings(IMPORT_EXPORT_IMPORT_IGNORE_BLANK_LINES=False)
def test_xlsx_create_dataset__empty_rows(self):
    """Keep blank rows when the ignore-blank-lines flag is disabled.

    With ``IMPORT_EXPORT_IMPORT_IGNORE_BLANK_LINES=False`` every row after
    the header is expected in the dataset, including rows whose cells are
    all ``None``. This pins the backwards-compatible behaviour.
    """
    rows_before = 3
    empty_rows = 5
    rows_after = 2
    data_row = ["Data1", "Data2", "Data3"]
    blank_row = [None, None, None]

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.append(["Header1", "Header2", "Header3"])
    # Sheet layout: data rows, a run of blank rows, then more data rows.
    for row, count in (
        (data_row, rows_before),
        (blank_row, empty_rows),
        (data_row, rows_after),
    ):
        for _ in range(count):
            ws.append(row)

    xlsx_data = BytesIO()
    wb.save(xlsx_data)

    # getvalue() returns the whole buffer regardless of stream position.
    dataset = self.format.create_dataset(xlsx_data.getvalue())

    # assertEqual reports both values on failure and is not stripped
    # when Python runs with -O; blank rows are counted here.
    self.assertEqual(len(dataset), rows_before + empty_rows + rows_after)
@override_settings(IMPORT_EXPORT_IMPORT_IGNORE_BLANK_LINES=True)
def test_xlsx_create_dataset__ignore_empty_rows(self):
    """Drop blank rows when the ignore-blank-lines flag is enabled.

    With ``IMPORT_EXPORT_IMPORT_IGNORE_BLANK_LINES=True`` rows whose cells
    are all ``None`` must not be added to the dataset, while the data rows
    before and after them are kept.
    """
    rows_before = 3
    empty_rows = 5
    rows_after = 2
    data_row = ["Data1", "Data2", "Data3"]
    blank_row = [None, None, None]

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.append(["Header1", "Header2", "Header3"])
    # Sheet layout: data rows, a run of blank rows, then more data rows.
    for row, count in (
        (data_row, rows_before),
        (blank_row, empty_rows),
        (data_row, rows_after),
    ):
        for _ in range(count):
            ws.append(row)

    xlsx_data = BytesIO()
    wb.save(xlsx_data)

    # getvalue() returns the whole buffer regardless of stream position.
    dataset = self.format.create_dataset(xlsx_data.getvalue())

    # assertEqual reports both values on failure and is not stripped
    # when Python runs with -O; only the non-blank rows remain.
    self.assertEqual(len(dataset), rows_before + rows_after)
class CSVTest(TestCase):
def setUp(self):