From 3b8fa144c589dc9559cd99247285f54149d9767c Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Sat, 2 Mar 2024 20:45:12 +0100
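Subject: [PATCH] Replace ensure_clean with tmp_path-based unique filenames in tests

Replace the private `pandas._testing.ensure_clean` context manager with
`get_unique_filename(..., data_dir=tmp_path)` in the affected tests, so the
temporary files are created inside pytest's per-test `tmp_path` directory.

Because the files are no longer deleted when a `with` block exits, the
`AsyncReadMode`-dependent placement of `df_equals` is dropped, and
`eval_io_from_str` no longer removes the file it writes.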

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
---
 .../hdk_on_native/test/test_dataframe.py      |  82 ++++-----
 modin/experimental/pandas/test/test_io_exp.py |  68 +++-----
 modin/pandas/test/dataframe/test_indexing.py  |  20 ++-
 modin/pandas/test/test_io.py                  | 163 +++++++++---------
 modin/pandas/test/utils.py                    |  22 +--
 5 files changed, 165 insertions(+), 190 deletions(-)

diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py
index b71d808a8cc..cc1f5b1242e 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py
@@ -18,7 +18,6 @@
 import pandas
 import pyarrow
 import pytest
-from pandas._testing import ensure_clean
 from pandas.core.dtypes.common import is_list_like
 from pyhdk import __version__ as hdk_version
 
@@ -26,6 +25,7 @@
 from modin.pandas.test.utils import (
     create_test_dfs,
     default_to_pandas_ignore_string,
+    get_unique_filename,
     io_ops_bad_exc,
     random_state,
     test_data,
@@ -324,17 +324,17 @@ def test_read_csv_datetime(
 
     @pytest.mark.parametrize("engine", [None, "arrow"])
     @pytest.mark.parametrize("parse_dates", [None, True, False])
-    def test_read_csv_datetime_tz(self, engine, parse_dates):
-        with ensure_clean(".csv") as file:
-            with open(file, "w") as f:
-                f.write("test\n2023-01-01T00:00:00.000-07:00")
+    def test_read_csv_datetime_tz(self, engine, parse_dates, tmp_path):
+        unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+        with open(unique_filename, "w") as f:
+            f.write("test\n2023-01-01T00:00:00.000-07:00")
 
-            eval_io(
-                fn_name="read_csv",
-                filepath_or_buffer=file,
-                md_extra_kwargs={"engine": engine},
-                parse_dates=parse_dates,
-            )
+        eval_io(
+            fn_name="read_csv",
+            filepath_or_buffer=unique_filename,
+            md_extra_kwargs={"engine": engine},
+            parse_dates=parse_dates,
+        )
 
     @pytest.mark.parametrize("engine", [None, "arrow"])
     @pytest.mark.parametrize(
@@ -382,26 +382,26 @@ def test_read_csv_col_handling(
             "c1.1,c1,c1.1,c1,c1.1,c1.2,c1.2,c2",
         ],
     )
-    def test_read_csv_duplicate_cols(self, cols):
+    def test_read_csv_duplicate_cols(self, cols, tmp_path):
         def test(df, lib, **kwargs):
             data = f"{cols}\n"
-            with ensure_clean(".csv") as fname:
-                with open(fname, "w") as f:
-                    f.write(data)
-                return lib.read_csv(fname)
+            unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+            with open(unique_filename, "w") as f:
+                f.write(data)
+            return lib.read_csv(unique_filename)
 
         run_and_compare(test, data={})
 
-    def test_read_csv_dtype_object(self):
+    def test_read_csv_dtype_object(self, tmp_path):
         with pytest.warns(UserWarning) as warns:
-            with ensure_clean(".csv") as file:
-                with open(file, "w") as f:
-                    f.write("test\ntest")
+            unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+            with open(unique_filename, "w") as f:
+                f.write("test\ntest")
 
-                def test(**kwargs):
-                    return pd.read_csv(file, dtype={"test": "object"})
+            def test(**kwargs):
+                return pd.read_csv(unique_filename, dtype={"test": "object"})
 
-                run_and_compare(test, data={})
+            run_and_compare(test, data={})
             for warn in warns.list:
                 assert not re.match(r".*defaulting to pandas.*", str(warn))
 
@@ -870,30 +870,30 @@ def concat(df1, df2, lib, **kwargs):
     @pytest.mark.parametrize("transform", [True, False])
     @pytest.mark.parametrize("sort_last", [True, False])
     # RecursionError in case of concatenation of big number of frames
-    def test_issue_5889(self, transform, sort_last):
-        with ensure_clean(".csv") as file:
-            data = {"a": [1, 2, 3], "b": [1, 2, 3]} if transform else {"a": [1, 2, 3]}
-            pandas.DataFrame(data).to_csv(file, index=False)
+    def test_issue_5889(self, transform, sort_last, tmp_path):
+        unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+        data = {"a": [1, 2, 3], "b": [1, 2, 3]} if transform else {"a": [1, 2, 3]}
+        pandas.DataFrame(data).to_csv(unique_filename, index=False)
 
-            def test_concat(lib, **kwargs):
-                if transform:
+        def test_concat(lib, **kwargs):
+            if transform:
 
-                    def read_csv():
-                        return lib.read_csv(file)["b"]
+                def read_csv():
+                    return lib.read_csv(unique_filename)["b"]
 
-                else:
+            else:
 
-                    def read_csv():
-                        return lib.read_csv(file)
+                def read_csv():
+                    return lib.read_csv(unique_filename)
 
-                df = read_csv()
-                for _ in range(100):
-                    df = lib.concat([df, read_csv()])
-                if sort_last:
-                    df = lib.concat([df, read_csv()], sort=True)
-                return df
+            df = read_csv()
+            for _ in range(100):
+                df = lib.concat([df, read_csv()])
+            if sort_last:
+                df = lib.concat([df, read_csv()], sort=True)
+            return df
 
-            run_and_compare(test_concat, data={})
+        run_and_compare(test_concat, data={})
 
 
 class TestGroupby:
diff --git a/modin/experimental/pandas/test/test_io_exp.py b/modin/experimental/pandas/test/test_io_exp.py
index ed091dee10a..8e9ebdf0df7 100644
--- a/modin/experimental/pandas/test/test_io_exp.py
+++ b/modin/experimental/pandas/test/test_io_exp.py
@@ -18,13 +18,13 @@
 import numpy as np
 import pandas
 import pytest
-from pandas._testing import ensure_clean
 
 import modin.experimental.pandas as pd
-from modin.config import AsyncReadMode, Engine
+from modin.config import Engine
 from modin.pandas.test.utils import (
     df_equals,
     eval_general,
+    get_unique_filename,
     parse_dates_values_by_id,
     test_data,
     time_parsing_csv_path,
@@ -355,7 +355,7 @@ def test_xml_glob(tmp_path, filename):
     reason=f"{Engine.get()} does not have experimental read_custom_text API",
 )
 @pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True)
-def test_read_custom_json_text(set_async_read_mode):
+def test_read_custom_json_text(set_async_read_mode, tmp_path):
     def _generate_json(file_name, nrows, ncols):
         data = np.random.rand(nrows, ncols)
         df = pandas.DataFrame(data, columns=[f"col{x}" for x in range(ncols)])
@@ -374,25 +374,19 @@ def _custom_parser(io_input, **kwargs):
                 result[key].append(obj[key])
         return pandas.DataFrame(result).rename(columns={"col0": "testID"})
 
-    with ensure_clean() as filename:
-        _generate_json(filename, 64, 8)
+    unique_filename = get_unique_filename(data_dir=tmp_path)
+    _generate_json(unique_filename, 64, 8)
 
-        df1 = pd.read_custom_text(
-            filename,
-            columns=["testID", "col1", "col3"],
-            custom_parser=_custom_parser,
-            is_quoting=False,
-        )
-        df2 = pd.read_json(filename, lines=True)[["col0", "col1", "col3"]].rename(
-            columns={"col0": "testID"}
-        )
-        if AsyncReadMode.get():
-            # If read operations are asynchronous, then the dataframes
-            # check should be inside `ensure_clean` context
-            # because the file may be deleted before actual reading starts
-            df_equals(df1, df2)
-    if not AsyncReadMode.get():
-        df_equals(df1, df2)
+    df1 = pd.read_custom_text(
+        unique_filename,
+        columns=["testID", "col1", "col3"],
+        custom_parser=_custom_parser,
+        is_quoting=False,
+    )
+    df2 = pd.read_json(unique_filename, lines=True)[["col0", "col1", "col3"]].rename(
+        columns={"col0": "testID"}
+    )
+    df_equals(df1, df2)
 
 
 @pytest.mark.skipif(
@@ -400,7 +394,7 @@ def _custom_parser(io_input, **kwargs):
     reason=f"{Engine.get()} does not have experimental API",
 )
 @pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True)
-def test_read_evaluated_dict(set_async_read_mode):
+def test_read_evaluated_dict(set_async_read_mode, tmp_path):
     def _generate_evaluated_dict(file_name, nrows, ncols):
         result = {}
         keys = [f"col{x}" for x in range(ncols)]
@@ -430,23 +424,17 @@ def columns_callback(io_input, **kwargs):
             break
         return columns
 
-    with ensure_clean() as filename:
-        _generate_evaluated_dict(filename, 64, 8)
+    unique_filename = get_unique_filename(data_dir=tmp_path)
+    _generate_evaluated_dict(unique_filename, 64, 8)
 
-        df1 = pd.read_custom_text(
-            filename,
-            columns=["col1", "col2"],
-            custom_parser=_custom_parser,
-        )
-        assert df1.shape == (64, 2)
+    df1 = pd.read_custom_text(
+        unique_filename,
+        columns=["col1", "col2"],
+        custom_parser=_custom_parser,
+    )
+    assert df1.shape == (64, 2)
 
-        df2 = pd.read_custom_text(
-            filename, columns=columns_callback, custom_parser=_custom_parser
-        )
-        if AsyncReadMode.get():
-            # If read operations are asynchronous, then the dataframes
-            # check should be inside `ensure_clean` context
-            # because the file may be deleted before actual reading starts
-            df_equals(df1, df2)
-    if not AsyncReadMode.get():
-        df_equals(df1, df2)
+    df2 = pd.read_custom_text(
+        unique_filename, columns=columns_callback, custom_parser=_custom_parser
+    )
+    df_equals(df1, df2)
diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py
index bf58b572168..0b165a391f5 100644
--- a/modin/pandas/test/dataframe/test_indexing.py
+++ b/modin/pandas/test/dataframe/test_indexing.py
@@ -17,7 +17,6 @@
 import numpy as np
 import pandas
 import pytest
-from pandas._testing import ensure_clean
 from pandas.testing import assert_index_equal
 
 import modin.pandas as pd
@@ -35,6 +34,7 @@
     df_equals,
     eval_general,
     generate_multiindex,
+    get_unique_filename,
     int_arg_keys,
     int_arg_values,
     name_contains,
@@ -2207,14 +2207,16 @@ def test___setitem__partitions_aligning():
     df_equals(md_df, pd_df)
 
 
-def test___setitem__with_mismatched_partitions():
-    with ensure_clean(".csv") as fname:
-        np.savetxt(fname, np.random.randint(0, 100, size=(200_000, 99)), delimiter=",")
-        modin_df = pd.read_csv(fname)
-        pandas_df = pandas.read_csv(fname)
-        modin_df["new"] = pd.Series(list(range(len(modin_df))))
-        pandas_df["new"] = pandas.Series(list(range(len(pandas_df))))
-        df_equals(modin_df, pandas_df)
+def test___setitem__with_mismatched_partitions(tmp_path):
+    unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+    np.savetxt(
+        unique_filename, np.random.randint(0, 100, size=(200_000, 99)), delimiter=","
+    )
+    modin_df = pd.read_csv(unique_filename)
+    pandas_df = pandas.read_csv(unique_filename)
+    modin_df["new"] = pd.Series(list(range(len(modin_df))))
+    pandas_df["new"] = pandas.Series(list(range(len(pandas_df))))
+    df_equals(modin_df, pandas_df)
 
 
 def test___setitem__mask():
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
index 09945c669d9..46ec1e36973 100644
--- a/modin/pandas/test/test_io.py
+++ b/modin/pandas/test/test_io.py
@@ -31,7 +31,6 @@
 import pytest
 import sqlalchemy as sa
 from packaging import version
-from pandas._testing import ensure_clean
 from pandas.errors import ParserWarning
 from scipy import sparse
 
@@ -472,16 +471,16 @@ def test_read_csv_parsing_3(
             nrows=nrows,
         )
 
-    def test_read_csv_skipinitialspace(self):
-        with ensure_clean(".csv") as unique_filename:
-            str_initial_spaces = (
-                "col1,col2,col3,col4\n"
-                + "five,  six,  seven,  eight\n"
-                + "    five,    six,    seven,    eight\n"
-                + "five, six,  seven,   eight\n"
-            )
+    def test_read_csv_skipinitialspace(self, tmp_path):
+        unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+        str_initial_spaces = (
+            "col1,col2,col3,col4\n"
+            + "five,  six,  seven,  eight\n"
+            + "    five,    six,    seven,    eight\n"
+            + "five, six,  seven,   eight\n"
+        )
 
-            eval_io_from_str(str_initial_spaces, unique_filename, skipinitialspace=True)
+        eval_io_from_str(str_initial_spaces, unique_filename, skipinitialspace=True)
 
     # NA and Missing Data Handling tests
     @pytest.mark.parametrize("na_values", ["custom_nan", "73"])
@@ -552,17 +551,17 @@ def test_read_csv_datetime(
     @pytest.mark.parametrize("date", ["2023-01-01 00:00:01.000000000", "2023"])
     @pytest.mark.parametrize("dtype", [None, "str", {"id": "int64"}])
     @pytest.mark.parametrize("parse_dates", [None, [], ["date"], [1]])
-    def test_read_csv_dtype_parse_dates(self, date, dtype, parse_dates):
-        with ensure_clean(".csv") as filename:
-            with open(filename, "w") as file:
-                file.write(f"id,date\n1,{date}")
-            eval_io(
-                fn_name="read_csv",
-                # read_csv kwargs
-                filepath_or_buffer=filename,
-                dtype=dtype,
-                parse_dates=parse_dates,
-            )
+    def test_read_csv_dtype_parse_dates(self, date, dtype, parse_dates, tmp_path):
+        unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+        with open(unique_filename, "w") as file:
+            file.write(f"id,date\n1,{date}")
+        eval_io(
+            fn_name="read_csv",
+            # read_csv kwargs
+            filepath_or_buffer=unique_filename,
+            dtype=dtype,
+            parse_dates=parse_dates,
+        )
 
     # Iteration tests
     @pytest.mark.parametrize("iterator", [True, False])
@@ -865,13 +864,12 @@ def test_read_csv_internal(
 
     # Issue related, specific or corner cases
     @pytest.mark.parametrize("nrows", [2, None])
-    def test_read_csv_bad_quotes(self, nrows):
+    def test_read_csv_bad_quotes(self, nrows, tmp_path):
         csv_bad_quotes = (
             '1, 2, 3, 4\none, two, three, four\nfive, "six", seven, "eight\n'
         )
-
-        with ensure_clean(".csv") as unique_filename:
-            eval_io_from_str(csv_bad_quotes, unique_filename, nrows=nrows)
+        unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+        eval_io_from_str(csv_bad_quotes, unique_filename, nrows=nrows)
 
     def test_read_csv_categories(self):
         eval_io(
@@ -1247,20 +1245,13 @@ def test_to_csv_with_index(self, tmp_path):
         eval_to_csv_file(tmp_path, modin_df, pandas_df, "csv")
 
     @pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True)
-    def test_read_csv_issue_5150(self, set_async_read_mode):
-        with ensure_clean(".csv") as unique_filename:
-            pandas_df = pandas.DataFrame(np.random.randint(0, 100, size=(2**6, 2**6)))
-            pandas_df.to_csv(unique_filename, index=False)
-            expected_pandas_df = pandas.read_csv(unique_filename, index_col=False)
-            modin_df = pd.read_csv(unique_filename, index_col=False)
-            actual_pandas_df = modin_df._to_pandas()
-            if AsyncReadMode.get():
-                # If read operations are asynchronous, then the dataframes
-                # check should be inside `ensure_clean` context
-                # because the file may be deleted before actual reading starts
-                df_equals(expected_pandas_df, actual_pandas_df)
-        if not AsyncReadMode.get():
-            df_equals(expected_pandas_df, actual_pandas_df)
+    def test_read_csv_issue_5150(self, set_async_read_mode, tmp_path):
+        unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+        pandas_df = pandas.DataFrame(np.random.randint(0, 100, size=(2**6, 2**6)))
+        pandas_df.to_csv(unique_filename, index=False)
+        expected_pandas_df = pandas.read_csv(unique_filename, index_col=False)
+        modin_actual_df = pd.read_csv(unique_filename, index_col=False)
+        df_equals(expected_pandas_df, modin_actual_df)
 
     @pytest.mark.parametrize("usecols", [None, [0, 1, 2, 3, 4]])
     def test_read_csv_1930(self, usecols):
@@ -1424,18 +1415,18 @@ def comparator(df1, df2):
         )
 
     # Tests issue #6778
-    def test_read_parquet_no_extension(self, engine, make_parquet_file):
-        with ensure_clean(".parquet") as unique_filename:
-            # Remove the .parquet extension
-            no_ext_fname = unique_filename[: unique_filename.index(".parquet")]
+    def test_read_parquet_no_extension(self, engine, make_parquet_file, tmp_path):
+        unique_filename = get_unique_filename(extension="parquet", data_dir=tmp_path)
+        # Remove the .parquet extension (last occurrence, not one in tmp_path)
+        no_ext_fname = unique_filename[: unique_filename.rindex(".parquet")]
 
-            make_parquet_file(filename=no_ext_fname)
-            eval_io(
-                fn_name="read_parquet",
-                # read_parquet kwargs
-                engine=engine,
-                path=no_ext_fname,
-            )
+        make_parquet_file(filename=no_ext_fname)
+        eval_io(
+            fn_name="read_parquet",
+            # read_parquet kwargs
+            engine=engine,
+            path=no_ext_fname,
+        )
 
     @pytest.mark.parametrize(
         "filters",
@@ -1762,7 +1753,7 @@ def test_read_parquet_partitioned_directory(
             ],
         ],
     )
-    def test_read_parquet_pandas_index(self, engine, filters):
+    def test_read_parquet_pandas_index(self, engine, filters, tmp_path):
         if (
             version.parse(pa.__version__) >= version.parse("12.0.0")
             and version.parse(pd.__version__) < version.parse("2.0.0")
@@ -1815,26 +1806,28 @@ def test_read_parquet_pandas_index(self, engine, filters):
                 ):
                     continue
 
-                with ensure_clean(".parquet") as unique_filename:
-                    pandas_df.set_index(col).to_parquet(unique_filename)
-                    # read the same parquet using modin.pandas
-                    eval_io(
-                        "read_parquet",
-                        # read_parquet kwargs
-                        path=unique_filename,
-                        engine=engine,
-                        filters=filters,
-                    )
-
-        with ensure_clean(".parquet") as unique_filename:
-            pandas_df.set_index(["idx", "A"]).to_parquet(unique_filename)
-            eval_io(
-                "read_parquet",
-                # read_parquet kwargs
-                path=unique_filename,
-                engine=engine,
-                filters=filters,
-            )
+                unique_filename = get_unique_filename(
+                    extension="parquet", data_dir=tmp_path
+                )
+                pandas_df.set_index(col).to_parquet(unique_filename)
+                # read the same parquet using modin.pandas
+                eval_io(
+                    "read_parquet",
+                    # read_parquet kwargs
+                    path=unique_filename,
+                    engine=engine,
+                    filters=filters,
+                )
+
+        unique_filename = get_unique_filename(extension="parquet", data_dir=tmp_path)
+        pandas_df.set_index(["idx", "A"]).to_parquet(unique_filename)
+        eval_io(
+            "read_parquet",
+            # read_parquet kwargs
+            path=unique_filename,
+            engine=engine,
+            filters=filters,
+        )
 
     @pytest.mark.parametrize(
         "filters",
@@ -2444,24 +2437,24 @@ def test_HDFStore(self, tmp_path):
         df_equals(modin_df, pandas_df)
         assert isinstance(modin_store, pd.HDFStore)
 
-        with ensure_clean(".hdf5") as hdf_file:
-            with pd.HDFStore(hdf_file, mode="w") as store:
-                store.append("data/df1", pd.DataFrame(np.random.randn(5, 5)))
-                store.append("data/df2", pd.DataFrame(np.random.randn(4, 4)))
+        unique_filename = get_unique_filename(extension="hdf5", data_dir=tmp_path)
+        with pd.HDFStore(unique_filename, mode="w") as store:
+            store.append("data/df1", pd.DataFrame(np.random.randn(5, 5)))
+            store.append("data/df2", pd.DataFrame(np.random.randn(4, 4)))
 
-            modin_df = pd.read_hdf(hdf_file, key="data/df1", mode="r")
-            pandas_df = pandas.read_hdf(hdf_file, key="data/df1", mode="r")
+        modin_df = pd.read_hdf(unique_filename, key="data/df1", mode="r")
+        pandas_df = pandas.read_hdf(unique_filename, key="data/df1", mode="r")
         df_equals(modin_df, pandas_df)
 
-    def test_HDFStore_in_read_hdf(self):
-        with ensure_clean(".hdf") as filename:
-            dfin = pd.DataFrame(np.random.rand(8, 8))
-            dfin.to_hdf(filename, "/key")
+    def test_HDFStore_in_read_hdf(self, tmp_path):
+        unique_filename = get_unique_filename(extension="hdf", data_dir=tmp_path)
+        dfin = pd.DataFrame(np.random.rand(8, 8))
+        dfin.to_hdf(unique_filename, "/key")
 
-            with pd.HDFStore(filename) as h:
-                modin_df = pd.read_hdf(h, "/key")
-            with pandas.HDFStore(filename) as h:
-                pandas_df = pandas.read_hdf(h, "/key")
+        with pd.HDFStore(unique_filename) as h:
+            modin_df = pd.read_hdf(h, "/key")
+        with pandas.HDFStore(unique_filename) as h:
+            pandas_df = pandas.read_hdf(h, "/key")
         df_equals(modin_df, pandas_df)
 
 
diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py
index 55420b1d46f..3fb161860eb 100644
--- a/modin/pandas/test/utils.py
+++ b/modin/pandas/test/utils.py
@@ -1014,22 +1014,14 @@ def eval_io_from_str(csv_str: str, unique_filename: str, **kwargs):
     unique_filename: str
         csv file name.
     """
-    try:
-        with open(unique_filename, "w") as f:
-            f.write(csv_str)
+    with open(unique_filename, "w") as f:
+        f.write(csv_str)
 
-        eval_io(
-            filepath_or_buffer=unique_filename,
-            fn_name="read_csv",
-            **kwargs,
-        )
-
-    finally:
-        if os.path.exists(unique_filename):
-            try:
-                os.remove(unique_filename)
-            except PermissionError:
-                pass
+    eval_io(
+        filepath_or_buffer=unique_filename,
+        fn_name="read_csv",
+        **kwargs,
+    )
 
 
 def create_test_dfs(*args, **kwargs):