From 3b8fa144c589dc9559cd99247285f54149d9767c Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sat, 2 Mar 2024 20:45:12 +0100 Subject: [PATCH] fixes Signed-off-by: Anatoly Myachev --- .../hdk_on_native/test/test_dataframe.py | 82 ++++----- modin/experimental/pandas/test/test_io_exp.py | 68 +++----- modin/pandas/test/dataframe/test_indexing.py | 20 ++- modin/pandas/test/test_io.py | 163 +++++++++--------- modin/pandas/test/utils.py | 22 +-- 5 files changed, 165 insertions(+), 190 deletions(-) diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py index b71d808a8cc..cc1f5b1242e 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py @@ -18,7 +18,6 @@ import pandas import pyarrow import pytest -from pandas._testing import ensure_clean from pandas.core.dtypes.common import is_list_like from pyhdk import __version__ as hdk_version @@ -26,6 +25,7 @@ from modin.pandas.test.utils import ( create_test_dfs, default_to_pandas_ignore_string, + get_unique_filename, io_ops_bad_exc, random_state, test_data, @@ -324,17 +324,17 @@ def test_read_csv_datetime( @pytest.mark.parametrize("engine", [None, "arrow"]) @pytest.mark.parametrize("parse_dates", [None, True, False]) - def test_read_csv_datetime_tz(self, engine, parse_dates): - with ensure_clean(".csv") as file: - with open(file, "w") as f: - f.write("test\n2023-01-01T00:00:00.000-07:00") + def test_read_csv_datetime_tz(self, engine, parse_dates, tmp_path): + unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path) + with open(unique_filename, "w") as f: + f.write("test\n2023-01-01T00:00:00.000-07:00") - eval_io( - fn_name="read_csv", - filepath_or_buffer=file, - md_extra_kwargs={"engine": engine}, - parse_dates=parse_dates, - ) + eval_io( + fn_name="read_csv", + filepath_or_buffer=unique_filename, + md_extra_kwargs={"engine": engine}, + parse_dates=parse_dates, + ) @pytest.mark.parametrize("engine", [None, "arrow"]) @pytest.mark.parametrize( @@ -382,26 +382,26 @@ def test_read_csv_col_handling( "c1.1,c1,c1.1,c1,c1.1,c1.2,c1.2,c2", ], ) - def test_read_csv_duplicate_cols(self, cols): + def test_read_csv_duplicate_cols(self, cols, tmp_path): def test(df, lib, **kwargs): data = f"{cols}\n" - with ensure_clean(".csv") as fname: - with open(fname, "w") as f: - f.write(data) - return lib.read_csv(fname) + unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path) + with open(unique_filename, "w") as f: + f.write(data) + return lib.read_csv(unique_filename) run_and_compare(test, data={}) - def test_read_csv_dtype_object(self): + def test_read_csv_dtype_object(self, tmp_path): with pytest.warns(UserWarning) as warns: - with ensure_clean(".csv") as file: - with open(file, "w") as f: - f.write("test\ntest") + unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path) + with open(unique_filename, "w") as f: + f.write("test\ntest") - def test(**kwargs): - return pd.read_csv(file, dtype={"test": "object"}) + def test(**kwargs): + return pd.read_csv(unique_filename, dtype={"test": "object"}) - run_and_compare(test, data={}) + run_and_compare(test, data={}) for warn in warns.list: assert not re.match(r".*defaulting to pandas.*", str(warn)) @@ -870,30 +870,30 @@ def concat(df1, df2, lib, **kwargs): @pytest.mark.parametrize("transform", [True, False]) @pytest.mark.parametrize("sort_last", [True, False]) # RecursionError in case of concatenation of big number of frames - def test_issue_5889(self, transform, sort_last): - with ensure_clean(".csv") as file: - data = {"a": [1, 2, 3], "b": [1, 2, 3]} if transform else {"a": [1, 2, 3]} - pandas.DataFrame(data).to_csv(file, index=False) + def test_issue_5889(self, transform, sort_last, tmp_path): + unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path) + data = {"a": [1, 2, 3], "b": [1, 2, 3]} if transform else {"a": [1, 2, 3]} + pandas.DataFrame(data).to_csv(unique_filename, index=False) - def test_concat(lib, **kwargs): - if transform: + def test_concat(lib, **kwargs): + if transform: - def read_csv(): - return lib.read_csv(file)["b"] + def read_csv(): + return lib.read_csv(unique_filename)["b"] - else: + else: - def read_csv(): - return lib.read_csv(file) + def read_csv(): + return lib.read_csv(unique_filename) - df = read_csv() - for _ in range(100): - df = lib.concat([df, read_csv()]) - if sort_last: - df = lib.concat([df, read_csv()], sort=True) - return df + df = read_csv() + for _ in range(100): + df = lib.concat([df, read_csv()]) + if sort_last: + df = lib.concat([df, read_csv()], sort=True) + return df - run_and_compare(test_concat, data={}) + run_and_compare(test_concat, data={}) class TestGroupby: diff --git a/modin/experimental/pandas/test/test_io_exp.py b/modin/experimental/pandas/test/test_io_exp.py index ed091dee10a..8e9ebdf0df7 100644 --- a/modin/experimental/pandas/test/test_io_exp.py +++ b/modin/experimental/pandas/test/test_io_exp.py @@ -18,13 +18,13 @@ import numpy as np import pandas import pytest -from pandas._testing import ensure_clean import modin.experimental.pandas as pd -from modin.config import AsyncReadMode, Engine +from modin.config import Engine from modin.pandas.test.utils import ( df_equals, eval_general, + get_unique_filename, parse_dates_values_by_id, test_data, time_parsing_csv_path, @@ -355,7 +355,7 @@ def test_xml_glob(tmp_path, filename): reason=f"{Engine.get()} does not have experimental read_custom_text API", ) @pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True) -def test_read_custom_json_text(set_async_read_mode): +def test_read_custom_json_text(set_async_read_mode, tmp_path): def _generate_json(file_name, nrows, ncols): data = np.random.rand(nrows, ncols) df = pandas.DataFrame(data, columns=[f"col{x}" for x in range(ncols)]) @@ -374,25 +374,19 @@ def _custom_parser(io_input, **kwargs): result[key].append(obj[key]) return pandas.DataFrame(result).rename(columns={"col0": "testID"}) - with ensure_clean() as filename: - _generate_json(filename, 64, 8) + unique_filename = get_unique_filename(data_dir=tmp_path) + _generate_json(unique_filename, 64, 8) - df1 = pd.read_custom_text( - filename, - columns=["testID", "col1", "col3"], - custom_parser=_custom_parser, - is_quoting=False, - ) - df2 = pd.read_json(filename, lines=True)[["col0", "col1", "col3"]].rename( - columns={"col0": "testID"} - ) - if AsyncReadMode.get(): - # If read operations are asynchronous, then the dataframes - # check should be inside `ensure_clean` context - # because the file may be deleted before actual reading starts - df_equals(df1, df2) - if not AsyncReadMode.get(): - df_equals(df1, df2) + df1 = pd.read_custom_text( + unique_filename, + columns=["testID", "col1", "col3"], + custom_parser=_custom_parser, + is_quoting=False, + ) + df2 = pd.read_json(unique_filename, lines=True)[["col0", "col1", "col3"]].rename( + columns={"col0": "testID"} + ) + df_equals(df1, df2) @pytest.mark.skipif( @@ -400,7 +394,7 @@ def _custom_parser(io_input, **kwargs): reason=f"{Engine.get()} does not have experimental API", ) @pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True) -def test_read_evaluated_dict(set_async_read_mode): +def test_read_evaluated_dict(set_async_read_mode, tmp_path): def _generate_evaluated_dict(file_name, nrows, ncols): result = {} keys = [f"col{x}" for x in range(ncols)] @@ -430,23 +424,17 @@ def columns_callback(io_input, **kwargs): break return columns - with ensure_clean() as filename: - _generate_evaluated_dict(filename, 64, 8) + unique_filename = get_unique_filename(data_dir=tmp_path) + _generate_evaluated_dict(unique_filename, 64, 8) - df1 = pd.read_custom_text( - filename, - columns=["col1", "col2"], - custom_parser=_custom_parser, - ) - assert df1.shape == (64, 2) + df1 = pd.read_custom_text( + unique_filename, + columns=["col1", "col2"], + custom_parser=_custom_parser, + ) + assert df1.shape == (64, 2) - df2 = pd.read_custom_text( - filename, columns=columns_callback, custom_parser=_custom_parser - ) - if AsyncReadMode.get(): - # If read operations are asynchronous, then the dataframes - # check should be inside `ensure_clean` context - # because the file may be deleted before actual reading starts - df_equals(df1, df2) - if not AsyncReadMode.get(): - df_equals(df1, df2) + df2 = pd.read_custom_text( + unique_filename, columns=columns_callback, custom_parser=_custom_parser + ) + df_equals(df1, df2) diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py index bf58b572168..0b165a391f5 100644 --- a/modin/pandas/test/dataframe/test_indexing.py +++ b/modin/pandas/test/dataframe/test_indexing.py @@ -17,7 +17,6 @@ import numpy as np import pandas import pytest -from pandas._testing import ensure_clean from pandas.testing import assert_index_equal import modin.pandas as pd @@ -35,6 +34,7 @@ df_equals, eval_general, generate_multiindex, + get_unique_filename, int_arg_keys, int_arg_values, name_contains, @@ -2207,14 +2207,16 @@ def test___setitem__partitions_aligning(): df_equals(md_df, pd_df) -def test___setitem__with_mismatched_partitions(): - with ensure_clean(".csv") as fname: - np.savetxt(fname, np.random.randint(0, 100, size=(200_000, 99)), delimiter=",") - modin_df = pd.read_csv(fname) - pandas_df = pandas.read_csv(fname) - modin_df["new"] = pd.Series(list(range(len(modin_df)))) - pandas_df["new"] = pandas.Series(list(range(len(pandas_df)))) - df_equals(modin_df, pandas_df) +def test___setitem__with_mismatched_partitions(tmp_path): + unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path) + np.savetxt( + unique_filename, np.random.randint(0, 100, size=(200_000, 99)), delimiter="," + ) + modin_df = pd.read_csv(unique_filename) + pandas_df = pandas.read_csv(unique_filename) + modin_df["new"] = pd.Series(list(range(len(modin_df)))) + pandas_df["new"] = pandas.Series(list(range(len(pandas_df)))) + df_equals(modin_df, pandas_df) def test___setitem__mask(): diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index 09945c669d9..46ec1e36973 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -31,7 +31,6 @@ import pytest import sqlalchemy as sa from packaging import version -from pandas._testing import ensure_clean from pandas.errors import ParserWarning from scipy import sparse @@ -472,16 +471,16 @@ def test_read_csv_parsing_3( nrows=nrows, ) - def test_read_csv_skipinitialspace(self): - with ensure_clean(".csv") as unique_filename: - str_initial_spaces = ( - "col1,col2,col3,col4\n" - + "five, six, seven, eight\n" - + " five, six, seven, eight\n" - + "five, six, seven, eight\n" - ) + def test_read_csv_skipinitialspace(self, tmp_path): + unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path) + str_initial_spaces = ( + "col1,col2,col3,col4\n" + + "five, six, seven, eight\n" + + " five, six, seven, eight\n" + + "five, six, seven, eight\n" + ) - eval_io_from_str(str_initial_spaces, unique_filename, skipinitialspace=True) + eval_io_from_str(str_initial_spaces, unique_filename, skipinitialspace=True) # NA and Missing Data Handling tests @pytest.mark.parametrize("na_values", ["custom_nan", "73"]) @@ -552,17 +551,17 @@ def test_read_csv_datetime( @pytest.mark.parametrize("date", ["2023-01-01 00:00:01.000000000", "2023"]) @pytest.mark.parametrize("dtype", [None, "str", {"id": "int64"}]) @pytest.mark.parametrize("parse_dates", [None, [], ["date"], [1]]) - def test_read_csv_dtype_parse_dates(self, date, dtype, parse_dates): - with ensure_clean(".csv") as filename: - with open(filename, "w") as file: - file.write(f"id,date\n1,{date}") - eval_io( - fn_name="read_csv", - # read_csv kwargs - filepath_or_buffer=filename, - dtype=dtype, - parse_dates=parse_dates, - ) + def test_read_csv_dtype_parse_dates(self, date, dtype, parse_dates, tmp_path): + unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path) + with open(unique_filename, "w") as file: + file.write(f"id,date\n1,{date}") + eval_io( + fn_name="read_csv", + # read_csv kwargs + filepath_or_buffer=unique_filename, + dtype=dtype, + parse_dates=parse_dates, + ) # Iteration tests @pytest.mark.parametrize("iterator", [True, False]) @@ -865,13 +864,12 @@ def test_read_csv_internal( # Issue related, specific or corner cases @pytest.mark.parametrize("nrows", [2, None]) - def test_read_csv_bad_quotes(self, nrows): + def test_read_csv_bad_quotes(self, nrows, tmp_path): csv_bad_quotes = ( '1, 2, 3, 4\none, two, three, four\nfive, "six", seven, "eight\n' ) - - with ensure_clean(".csv") as unique_filename: - eval_io_from_str(csv_bad_quotes, unique_filename, nrows=nrows) + unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path) + eval_io_from_str(csv_bad_quotes, unique_filename, nrows=nrows) def test_read_csv_categories(self): eval_io( @@ -1247,20 +1245,13 @@ def test_to_csv_with_index(self, tmp_path): eval_to_csv_file(tmp_path, modin_df, pandas_df, "csv") @pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True) - def test_read_csv_issue_5150(self, set_async_read_mode): - with ensure_clean(".csv") as unique_filename: - pandas_df = pandas.DataFrame(np.random.randint(0, 100, size=(2**6, 2**6))) - pandas_df.to_csv(unique_filename, index=False) - expected_pandas_df = pandas.read_csv(unique_filename, index_col=False) - modin_df = pd.read_csv(unique_filename, index_col=False) - actual_pandas_df = modin_df._to_pandas() - if AsyncReadMode.get(): - # If read operations are asynchronous, then the dataframes - # check should be inside `ensure_clean` context - # because the file may be deleted before actual reading starts - df_equals(expected_pandas_df, actual_pandas_df) - if not AsyncReadMode.get(): - df_equals(expected_pandas_df, actual_pandas_df) + def test_read_csv_issue_5150(self, set_async_read_mode, tmp_path): + unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path) + pandas_df = pandas.DataFrame(np.random.randint(0, 100, size=(2**6, 2**6))) + pandas_df.to_csv(unique_filename, index=False) + expected_pandas_df = pandas.read_csv(unique_filename, index_col=False) + modin_actual_df = pd.read_csv(unique_filename, index_col=False) + df_equals(expected_pandas_df, modin_actual_df) @pytest.mark.parametrize("usecols", [None, [0, 1, 2, 3, 4]]) def test_read_csv_1930(self, usecols): @@ -1424,18 +1415,18 @@ def comparator(df1, df2): ) # Tests issue #6778 - def test_read_parquet_no_extension(self, engine, make_parquet_file): - with ensure_clean(".parquet") as unique_filename: - # Remove the .parquet extension - no_ext_fname = unique_filename[: unique_filename.index(".parquet")] + def test_read_parquet_no_extension(self, engine, make_parquet_file, tmp_path): + unique_filename = get_unique_filename(extension="parquet", data_dir=tmp_path) + # Remove the .parquet extension + no_ext_fname = unique_filename[: unique_filename.index(".parquet")] - make_parquet_file(filename=no_ext_fname) - eval_io( - fn_name="read_parquet", - # read_parquet kwargs - engine=engine, - path=no_ext_fname, - ) + make_parquet_file(filename=no_ext_fname) + eval_io( + fn_name="read_parquet", + # read_parquet kwargs + engine=engine, + path=no_ext_fname, + ) @pytest.mark.parametrize( "filters", @@ -1762,7 +1753,7 @@ def test_read_parquet_partitioned_directory( ], ], ) - def test_read_parquet_pandas_index(self, engine, filters): + def test_read_parquet_pandas_index(self, engine, filters, tmp_path): if ( version.parse(pa.__version__) >= version.parse("12.0.0") and version.parse(pd.__version__) < version.parse("2.0.0") @@ -1815,26 +1806,28 @@ def test_read_parquet_pandas_index(self, engine, filters): ): continue - with ensure_clean(".parquet") as unique_filename: - pandas_df.set_index(col).to_parquet(unique_filename) - # read the same parquet using modin.pandas - eval_io( - "read_parquet", - # read_parquet kwargs - path=unique_filename, - engine=engine, - filters=filters, - ) - - with ensure_clean(".parquet") as unique_filename: - pandas_df.set_index(["idx", "A"]).to_parquet(unique_filename) - eval_io( - "read_parquet", - # read_parquet kwargs - path=unique_filename, - engine=engine, - filters=filters, - ) + unique_filename = get_unique_filename( + extension="parquet", data_dir=tmp_path + ) + pandas_df.set_index(col).to_parquet(unique_filename) + # read the same parquet using modin.pandas + eval_io( + "read_parquet", + # read_parquet kwargs + path=unique_filename, + engine=engine, + filters=filters, + ) + + unique_filename = get_unique_filename(extension="parquet", data_dir=tmp_path) + pandas_df.set_index(["idx", "A"]).to_parquet(unique_filename) + eval_io( + "read_parquet", + # read_parquet kwargs + path=unique_filename, + engine=engine, + filters=filters, + ) @pytest.mark.parametrize( "filters", @@ -2444,24 +2437,24 @@ def test_HDFStore(self, tmp_path): df_equals(modin_df, pandas_df) assert isinstance(modin_store, pd.HDFStore) - with ensure_clean(".hdf5") as hdf_file: - with pd.HDFStore(hdf_file, mode="w") as store: - store.append("data/df1", pd.DataFrame(np.random.randn(5, 5))) - store.append("data/df2", pd.DataFrame(np.random.randn(4, 4))) + unique_filename = get_unique_filename(extension="hdf5", data_dir=tmp_path) + with pd.HDFStore(unique_filename, mode="w") as store: + store.append("data/df1", pd.DataFrame(np.random.randn(5, 5))) + store.append("data/df2", pd.DataFrame(np.random.randn(4, 4))) - modin_df = pd.read_hdf(hdf_file, key="data/df1", mode="r") - pandas_df = pandas.read_hdf(hdf_file, key="data/df1", mode="r") + modin_df = pd.read_hdf(unique_filename, key="data/df1", mode="r") + pandas_df = pandas.read_hdf(unique_filename, key="data/df1", mode="r") df_equals(modin_df, pandas_df) - def test_HDFStore_in_read_hdf(self): - with ensure_clean(".hdf") as filename: - dfin = pd.DataFrame(np.random.rand(8, 8)) - dfin.to_hdf(filename, "/key") + def test_HDFStore_in_read_hdf(self, tmp_path): + unique_filename = get_unique_filename(extension="hdf", data_dir=tmp_path) + dfin = pd.DataFrame(np.random.rand(8, 8)) + dfin.to_hdf(unique_filename, "/key") - with pd.HDFStore(filename) as h: - modin_df = pd.read_hdf(h, "/key") - with pandas.HDFStore(filename) as h: - pandas_df = pandas.read_hdf(h, "/key") + with pd.HDFStore(unique_filename) as h: + modin_df = pd.read_hdf(h, "/key") + with pandas.HDFStore(unique_filename) as h: + pandas_df = pandas.read_hdf(h, "/key") df_equals(modin_df, pandas_df) diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index 55420b1d46f..3fb161860eb 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -1014,22 +1014,14 @@ def eval_io_from_str(csv_str: str, unique_filename: str, **kwargs): unique_filename: str csv file name. """ - try: - with open(unique_filename, "w") as f: - f.write(csv_str) + with open(unique_filename, "w") as f: + f.write(csv_str) - eval_io( - filepath_or_buffer=unique_filename, - fn_name="read_csv", - **kwargs, - ) - - finally: - if os.path.exists(unique_filename): - try: - os.remove(unique_filename) - except PermissionError: - pass + eval_io( + filepath_or_buffer=unique_filename, + fn_name="read_csv", + **kwargs, + ) def create_test_dfs(*args, **kwargs):