From 2722454671599eb36e5b56efcd9c3f2e1a0e0f34 Mon Sep 17 00:00:00 2001 From: banflam Date: Fri, 24 Jan 2025 14:32:49 -0500 Subject: [PATCH 1/8] Added error checking for max Excel rows, and raised exception when the max rows are exceeded for Excel --- py-polars/polars/dataframe/frame.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 30fe3691635e..c5e808663bc8 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -97,6 +97,7 @@ from polars.dependencies import pandas as pd from polars.dependencies import pyarrow as pa from polars.exceptions import ( + InvalidOperationError, ColumnNotFoundError, ModuleUpgradeRequiredError, NoRowsReturnedError, @@ -3505,6 +3506,11 @@ def write_excel( + int(bool(column_totals)), table_start[1] + len(df.columns) - 1, ) + + excel_max_valid_rows = 1048575 + if len(self.rows) > excel_max_valid_rows: + msg = "Dataframe too large to be compatible with Excel. Exceeded Excel limit of 1048575 rows of data." + raise InvalidOperationError(msg) # write table structure and formats into the target sheet if not is_empty or include_header: From cdd74c1a27504d03941bb0061e679cb83129f3fa Mon Sep 17 00:00:00 2001 From: banflam Date: Fri, 24 Jan 2025 15:21:03 -0500 Subject: [PATCH 2/8] fix(python): Throw exception if dataset is too large for Excel --- py-polars/polars/dataframe/frame.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index c5e808663bc8..465a2962aa4a 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -97,8 +97,8 @@ from polars.dependencies import pandas as pd from polars.dependencies import pyarrow as pa from polars.exceptions import ( - InvalidOperationError, ColumnNotFoundError, + InvalidOperationError, ModuleUpgradeRequiredError, NoRowsReturnedError, TooManyRowsReturnedError, @@ -3506,9 +3506,9 @@ def write_excel( + int(bool(column_totals)), table_start[1] + len(df.columns) - 1, ) - + excel_max_valid_rows = 1048575 - if len(self.rows) > excel_max_valid_rows: + if self.height > excel_max_valid_rows: msg = "Dataframe too large to be compatible with Excel. Exceeded Excel limit of 1048575 rows of data." raise InvalidOperationError(msg) From b59eb96697bb8793be601f01e27d23d68db04952 Mon Sep 17 00:00:00 2001 From: banflam Date: Sat, 25 Jan 2025 14:11:21 -0500 Subject: [PATCH 3/8] accounted for headers which will write a row, starting offset [numerical offset given in the form of an int tuple only], and checked to ensure max Excel cols are also not exceeded --- py-polars/polars/dataframe/frame.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 465a2962aa4a..f8090dabcb3f 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -3507,9 +3507,25 @@ def write_excel( table_start[1] + len(df.columns) - 1, ) + # verify that the number of rows and columns, after accounting for an offset and a header row, is within the maximum for Excel excel_max_valid_rows = 1048575 - if self.height > excel_max_valid_rows: - msg = "Dataframe too large to be compatible with Excel. Exceeded Excel limit of 1048575 rows of data." + excel_max_valid_cols = 16384 + + initial_row_offset = 0 + initial_col_offset = 0 + + # it is just a numerical offset given in the form of a tuple + if isinstance(position, tuple): + initial_row_offset, initial_col_offset = position + + total_rows = self.height + initial_row_offset + if include_header: + total_rows += 1 + + total_cols = self.width + initial_col_offset + + if total_rows > excel_max_valid_rows or total_cols > excel_max_valid_cols: + msg = "Dataframe too large to be compatible with Excel. Exceeded Excel limit of 1048575 rows and/or 16384 columns of data." raise InvalidOperationError(msg) # write table structure and formats into the target sheet From f1ff63f68f91cf1bd0e3ad8963083abd5a85bedc Mon Sep 17 00:00:00 2001 From: banflam Date: Sat, 25 Jan 2025 14:19:14 -0500 Subject: [PATCH 4/8] fixed formatting issue of comment too long --- py-polars/polars/dataframe/frame.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index f8090dabcb3f..0cf2b66649db 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -3507,23 +3507,25 @@ def write_excel( table_start[1] + len(df.columns) - 1, ) - # verify that the number of rows and columns, after accounting for an offset and a header row, is within the maximum for Excel + # verify the number of rows and columns, + # after accounting for an offset and a header row + # is within the maximum for Excel excel_max_valid_rows = 1048575 excel_max_valid_cols = 16384 - + initial_row_offset = 0 initial_col_offset = 0 - + # it is just a numerical offset given in the form of a tuple if isinstance(position, tuple): initial_row_offset, initial_col_offset = position - + total_rows = self.height + initial_row_offset if include_header: total_rows += 1 - + total_cols = self.width + initial_col_offset - + if total_rows > excel_max_valid_rows or total_cols > excel_max_valid_cols: msg = "Dataframe too large to be compatible with Excel. Exceeded Excel limit of 1048575 rows and/or 16384 columns of data." raise InvalidOperationError(msg) From 1b5659f148a9d8f3807bf22d712d2c6264ba717e Mon Sep 17 00:00:00 2001 From: banflam Date: Mon, 27 Jan 2025 12:06:18 -0500 Subject: [PATCH 5/8] removed old Excel limit checking code and replaced it with code that relies on the table_finish variable --- py-polars/polars/dataframe/frame.py | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 0cf2b66649db..52e66956f3bb 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -3507,27 +3507,14 @@ def write_excel( table_start[1] + len(df.columns) - 1, ) - # verify the number of rows and columns, - # after accounting for an offset and a header row - # is within the maximum for Excel excel_max_valid_rows = 1048575 excel_max_valid_cols = 16384 - initial_row_offset = 0 - initial_col_offset = 0 - - # it is just a numerical offset given in the form of a tuple - if isinstance(position, tuple): - initial_row_offset, initial_col_offset = position - - total_rows = self.height + initial_row_offset - if include_header: - total_rows += 1 - - total_cols = self.width + initial_col_offset - - if total_rows > excel_max_valid_rows or total_cols > excel_max_valid_cols: - msg = "Dataframe too large to be compatible with Excel. Exceeded Excel limit of 1048575 rows and/or 16384 columns of data." + if ( + table_finish[0] > excel_max_valid_rows + or table_finish[1] > excel_max_valid_cols + ): + msg = "Dataframe too large to be compatible with Excel. Exceeds Excel limit of 1048575 rows and/or 16384 columns of data." raise InvalidOperationError(msg) # write table structure and formats into the target sheet From 6a3b126fee879ca6437860a51617cf7380fb828c Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Wed, 29 Jan 2025 16:02:19 +0400 Subject: [PATCH 6/8] Update py-polars/polars/dataframe/frame.py --- py-polars/polars/dataframe/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 52e66956f3bb..372d98036f16 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -3514,7 +3514,7 @@ def write_excel( table_finish[0] > excel_max_valid_rows or table_finish[1] > excel_max_valid_cols ): - msg = "Dataframe too large to be compatible with Excel. Exceeds Excel limit of 1048575 rows and/or 16384 columns of data." + msg = f"writing {df.height}x{df.width} frame at {position!r} does not fit worksheet dimensions of {excel_max_valid_rows} rows and {excel_max_valid_cols} columns" raise InvalidOperationError(msg) # write table structure and formats into the target sheet From 137041f45854742a1d337d3b5c6f4677232cf61f Mon Sep 17 00:00:00 2001 From: banflam Date: Wed, 29 Jan 2025 12:30:08 -0500 Subject: [PATCH 7/8] added test to test that the InvalidOperationError is being raised correctly when exceeding max rows --- py-polars/tests/unit/io/test_spreadsheet.py | 38 ++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index a5ac7a568d99..5d4f8139c329 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ b/py-polars/tests/unit/io/test_spreadsheet.py @@ -12,7 +12,7 @@ import polars as pl import polars.selectors as cs -from polars.exceptions import NoDataError, ParameterCollisionError +from polars.exceptions import NoDataError, ParameterCollisionError, InvalidOperationError from polars.testing import assert_frame_equal, assert_series_equal from tests.unit.conftest import FLOAT_DTYPES, NUMERIC_DTYPES @@ -1183,6 +1183,42 @@ def test_excel_write_worksheet_object() -> None: with Workbook(BytesIO()) as wb: df.write_excel(None, worksheet=ws) +def test_excel_write_beyond_max_rows_cols() -> None: + + with pytest.raises(InvalidOperationError): + path = "/tmp/test.xlsx" + sheet = "mysheet" + ( + pl + .LazyFrame({ + "x": "a" + }) + .select(pl.repeat(pl.col("x"), 1048576)) + .collect() + .write_excel( + workbook=path, + worksheet=sheet, + ) + ) + + pl.read_excel(source=path, sheet_name=sheet) + + + with pytest.raises(InvalidOperationError): + path = "/tmp/test.xlsx" + sheet = "mysheet" + ( + pl + .DataFrame({ + "col1": range(10), + "col2": range(10, 20) + }) + .write_excel(workbook=path, worksheet=sheet,position="A1048570") + ) + + pl.read_excel(source=path, sheet_name=sheet) + + def test_excel_freeze_panes() -> None: from xlsxwriter import Workbook From 82b5afe60eeebd447243a151dc4f07d98eee38ae Mon Sep 17 00:00:00 2001 From: banflam Date: Wed, 29 Jan 2025 12:51:44 -0500 Subject: [PATCH 8/8] fixed formatting issues --- py-polars/tests/unit/io/test_spreadsheet.py | 43 +++++---------------- 1 file changed, 10 insertions(+), 33 deletions(-) diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index 5d4f8139c329..f0d03caf00fd 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ b/py-polars/tests/unit/io/test_spreadsheet.py @@ -12,7 +12,10 @@ import polars as pl import polars.selectors as cs -from polars.exceptions import NoDataError, ParameterCollisionError, InvalidOperationError +from polars.exceptions import ( + NoDataError, + ParameterCollisionError, +) from polars.testing import assert_frame_equal, assert_series_equal from tests.unit.conftest import FLOAT_DTYPES, NUMERIC_DTYPES @@ -1183,41 +1186,15 @@ def test_excel_write_worksheet_object() -> None: with Workbook(BytesIO()) as wb: df.write_excel(None, worksheet=ws) -def test_excel_write_beyond_max_rows_cols() -> None: - with pytest.raises(InvalidOperationError): - path = "/tmp/test.xlsx" - sheet = "mysheet" - ( - pl - .LazyFrame({ - "x": "a" - }) - .select(pl.repeat(pl.col("x"), 1048576)) - .collect() - .write_excel( - workbook=path, - worksheet=sheet, - ) - ) - - pl.read_excel(source=path, sheet_name=sheet) - - - with pytest.raises(InvalidOperationError): - path = "/tmp/test.xlsx" - sheet = "mysheet" - ( - pl - .DataFrame({ - "col1": range(10), - "col2": range(10, 20) - }) - .write_excel(workbook=path, worksheet=sheet,position="A1048570") - ) +def test_excel_write_beyond_max_rows_cols() -> None: + path = "/tmp/test.xlsx" + sheet = "mysheet" - pl.read_excel(source=path, sheet_name=sheet) + df = pl.DataFrame({"col1": range(10), "col2": range(10, 20)}) + with pytest.raises(pl.exceptions.InvalidOperationError): + df.write_excel(workbook=path, worksheet=sheet, position="A1048570") def test_excel_freeze_panes() -> None: