From b9f4408b6eaaa018b0a2400d4c50a4ef71898148 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Thu, 22 Feb 2024 09:20:23 +0100 Subject: [PATCH 1/6] Repair spark queries --- queries/pyspark/executor.py | 14 ++++++++------ queries/pyspark/utils.py | 4 ++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/queries/pyspark/executor.py b/queries/pyspark/executor.py index 4df4f77..f4fa8cb 100644 --- a/queries/pyspark/executor.py +++ b/queries/pyspark/executor.py @@ -1,7 +1,7 @@ from linetimer import CodeTimer # TODO: works for now, but need dynamic imports for this. -from pyspark import ( # noqa: F401 +from queries.pyspark import ( # noqa: F401 q1, q2, q3, @@ -30,9 +30,11 @@ num_queries = 22 with CodeTimer(name="Overall execution of ALL spark queries", unit="s"): - sub_modules = [f"q{sm}" for sm in range(1, num_queries + 1)] - for sm in sub_modules: + for query_number in range(1, num_queries + 1): + submodule = f"q{query_number}" try: - eval(f"{sm}.q()") - except Exception: - print(f"Exception occurred while executing spark_queries.{sm}") + eval(f"{submodule}.q()") + except Exception as exc: + print( + f"Exception occurred while executing PySpark query {query_number}:\n{exc}" + ) diff --git a/queries/pyspark/utils.py b/queries/pyspark/utils.py index e39ff0c..9a04946 100644 --- a/queries/pyspark/utils.py +++ b/queries/pyspark/utils.py @@ -28,8 +28,8 @@ def get_or_create_spark() -> SparkSession: return spark -def __read_parquet_ds(path: str, table_name: str) -> SparkDF: - df = get_or_create_spark().read.parquet(path) +def __read_parquet_ds(path: Path, table_name: str) -> SparkDF: + df = get_or_create_spark().read.parquet(str(path)) df.createOrReplaceTempView(table_name) return df From 88f452aa254cf23c8a86afd08feb8ba94a6bf18e Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Thu, 22 Feb 2024 10:05:32 +0100 Subject: [PATCH 2/6] Fix dask --- queries/dask/q7.py | 12 ++++++++---- queries/dask/utils.py | 9 ++++++--- queries/polars/utils.py | 4 ++-- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/queries/dask/q7.py b/queries/dask/q7.py index df8d54c..2ff433a 100644 --- a/queries/dask/q7.py +++ b/queries/dask/q7.py @@ -1,6 +1,9 @@ -import datetime +import warnings +from datetime import datetime -import dask.dataframe as dd +with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=DeprecationWarning) + import dask.dataframe as dd from queries.dask import utils @@ -8,8 +11,9 @@ def q(): - var1 = datetime.strptime("1995-01-01", "%Y-%m-%d") - var2 = datetime.strptime("1997-01-01", "%Y-%m-%d") + var1 = datetime(1995, 1, 1) + var2 = datetime(1997, 1, 1) + nation_ds = utils.get_nation_ds customer_ds = utils.get_customer_ds line_item_ds = utils.get_line_item_ds diff --git a/queries/dask/utils.py b/queries/dask/utils.py index f36511f..d5631ab 100644 --- a/queries/dask/utils.py +++ b/queries/dask/utils.py @@ -30,12 +30,13 @@ def read_ds(path: str) -> Union: return dd.from_pandas(pd.read_parquet(path), npartitions=os.cpu_count()) -def get_query_answer(query: int, base_dir: Path = ANSWERS_BASE_DIR) -> dd.DataFrame: +def get_query_answer( + query: int, base_dir: Path = ANSWERS_BASE_DIR +) -> dd.DataFrame: answer_df = pd.read_csv( base_dir / f"q{query}.out", sep="|", parse_dates=True, - infer_datetime_format=True, ) return answer_df.rename(columns=lambda x: x.strip()) @@ -52,7 +53,9 @@ def test_results(q_num: int, result_df: pd.DataFrame): s1 = s1.astype("string").apply(lambda x: x.strip()) s2 = s2.astype("string").apply(lambda x: x.strip()) - pd.testing.assert_series_equal(left=s1, right=s2, check_index=False) + pd.testing.assert_series_equal( + left=s1, right=s2, check_index=False, check_dtype=False + ) @on_second_call diff --git a/queries/polars/utils.py b/queries/polars/utils.py index afdb2f3..9bae1c7 100644 --- a/queries/polars/utils.py +++ b/queries/polars/utils.py @@ -35,9 +35,9 @@ def _scan_ds(path: Path): def get_query_answer( - query: int, base_dir: str = ANSWERS_PARQUET_BASE_DIR + query: int, base_dir: Path = ANSWERS_PARQUET_BASE_DIR ) -> pl.LazyFrame: - return pl.scan_parquet(Path(base_dir) / f"q{query}.parquet") + return pl.scan_parquet(base_dir / f"q{query}.parquet") def test_results(q_num: int, result_df: pl.DataFrame): From 4bd4f716c485e94148da4656ffa1e4102ae79724 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Thu, 22 Feb 2024 13:22:20 +0100 Subject: [PATCH 3/6] Fix pandas --- queries/pandas/q8.py | 2 +- queries/pandas/utils.py | 32 +++++++++++++++++--------------- scripts/prepare_data.py | 2 ++ 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/queries/pandas/q8.py b/queries/pandas/q8.py index d009a70..eab0d6f 100644 --- a/queries/pandas/q8.py +++ b/queries/pandas/q8.py @@ -89,7 +89,7 @@ def udf(df): numerator = df["volume"].sum() return round(numerator / demonimator, 2) - total = total.groupby("o_year", as_index=False).apply(udf) + total = total.groupby("o_year", as_index=False).apply(udf, include_groups=False) total.columns = ["o_year", "mkt_share"] total = total.sort_values(by=["o_year"], ascending=[True]) diff --git a/queries/pandas/utils.py b/queries/pandas/utils.py index fc29f4c..0fead0a 100644 --- a/queries/pandas/utils.py +++ b/queries/pandas/utils.py @@ -4,10 +4,12 @@ import pandas as pd from linetimer import CodeTimer, linetimer +from pandas.api.types import is_string_dtype from pandas.core.frame import DataFrame as PandasDF +from pandas.testing import assert_series_equal from queries.common_utils import ( - ANSWERS_BASE_DIR, + ANSWERS_PARQUET_BASE_DIR, DATASET_BASE_DIR, FILE_TYPE, LOG_TIMINGS, @@ -16,26 +18,23 @@ on_second_call, ) +pd.options.mode.copy_on_write = True + def _read_ds(path: Path) -> PandasDF: path = f"{path}.{FILE_TYPE}" if FILE_TYPE == "parquet": - return pd.read_parquet(path, dtype_backend="pyarrow", engine="pyarrow") + return pd.read_parquet(path, dtype_backend="pyarrow") elif FILE_TYPE == "feather": - return pd.read_feather(path) + return pd.read_feather(path, dtype_backend="pyarrow") else: msg = f"file type: {FILE_TYPE} not expected" raise ValueError(msg) -def get_query_answer(query: int, base_dir: str = ANSWERS_BASE_DIR) -> PandasDF: - answer_df = pd.read_csv( - Path(base_dir) / f"q{query}.out", - sep="|", - parse_dates=True, - infer_datetime_format=True, - ) - return answer_df.rename(columns=lambda x: x.strip()) +def get_query_answer(query: int, base_dir: str = ANSWERS_PARQUET_BASE_DIR) -> PandasDF: + path = base_dir / f"q{query}.parquet" + return pd.read_parquet(path, dtype_backend="pyarrow") def test_results(q_num: int, result_df: PandasDF): @@ -46,11 +45,14 @@ def test_results(q_num: int, result_df: PandasDF): s1 = result_df[c] s2 = answer[c] - if t.name == "object": - s1 = s1.astype("string").apply(lambda x: x.strip()) - s2 = s2.astype("string").apply(lambda x: x.strip()) + if is_string_dtype(t): + s1 = s1.apply(lambda x: x.strip()) + + # TODO: Remove this cast + if s2.dtype == "date32[day][pyarrow]": + s2 = s2.astype("timestamp[us][pyarrow]") - pd.testing.assert_series_equal(left=s1, right=s2, check_index=False) + assert_series_equal(left=s1, right=s2, check_index=False, check_dtype=False) @on_second_call diff --git a/scripts/prepare_data.py b/scripts/prepare_data.py index 7f34aeb..986402e 100644 --- a/scripts/prepare_data.py +++ b/scripts/prepare_data.py @@ -105,6 +105,8 @@ try_parse_dates=True, new_columns=eval(f"h_{name}"), ) + + # TODO: Remove this cast lf = lf.with_columns(pl.col(pl.Date).cast(pl.Datetime)) lf.sink_parquet(TABLES_DIR / f"{name}.parquet") From bc10f1d076619ec7b560b48e5438f3f70fca71c8 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Thu, 22 Feb 2024 13:35:30 +0100 Subject: [PATCH 4/6] Fix modin --- queries/dask/utils.py | 19 +++++++------------ queries/duckdb/utils.py | 9 +++++---- queries/modin/q3.py | 4 ++-- queries/modin/q4.py | 6 +++--- queries/modin/q5.py | 6 +++--- queries/modin/q6.py | 6 +++--- queries/modin/q7.py | 8 +++++--- queries/pyspark/utils.py | 12 ++++-------- requirements.txt | 2 +- 9 files changed, 33 insertions(+), 39 deletions(-) diff --git a/queries/dask/utils.py b/queries/dask/utils.py index d5631ab..94e29fa 100644 --- a/queries/dask/utils.py +++ b/queries/dask/utils.py @@ -7,9 +7,10 @@ import dask.dataframe as dd import pandas as pd from linetimer import CodeTimer, linetimer +from pandas.testing import assert_series_equal from queries.common_utils import ( - ANSWERS_BASE_DIR, + ANSWERS_PARQUET_BASE_DIR, DATASET_BASE_DIR, FILE_TYPE, INCLUDE_IO, @@ -31,14 +32,10 @@ def read_ds(path: str) -> Union: def get_query_answer( - query: int, base_dir: Path = ANSWERS_BASE_DIR -) -> dd.DataFrame: - answer_df = pd.read_csv( - base_dir / f"q{query}.out", - sep="|", - parse_dates=True, - ) - return answer_df.rename(columns=lambda x: x.strip()) + query: int, base_dir: str = ANSWERS_PARQUET_BASE_DIR +) -> pd.DataFrame: + path = base_dir / f"q{query}.parquet" + return pd.read_parquet(path) def test_results(q_num: int, result_df: pd.DataFrame): @@ -53,9 +50,7 @@ def test_results(q_num: int, result_df: pd.DataFrame): s1 = s1.astype("string").apply(lambda x: x.strip()) s2 = s2.astype("string").apply(lambda x: x.strip()) - pd.testing.assert_series_equal( - left=s1, right=s2, check_index=False, check_dtype=False - ) + assert_series_equal(left=s1, right=s2, check_index=False, check_dtype=False) @on_second_call diff --git a/queries/duckdb/utils.py b/queries/duckdb/utils.py index d3aff18..9047f74 100644 --- a/queries/duckdb/utils.py +++ b/queries/duckdb/utils.py @@ -6,7 +6,7 @@ import polars as pl from duckdb import DuckDBPyRelation from linetimer import CodeTimer, linetimer -from polars import testing as pl_test +from polars.testing import assert_frame_equal from queries.common_utils import ( ANSWERS_PARQUET_BASE_DIR, @@ -41,15 +41,16 @@ def _scan_ds(path: Path): def get_query_answer( - query: int, base_dir: str = ANSWERS_PARQUET_BASE_DIR + query: int, base_dir: Path = ANSWERS_PARQUET_BASE_DIR ) -> pl.LazyFrame: - return pl.scan_parquet(Path(base_dir) / f"q{query}.parquet") + path = base_dir / f"q{query}.parquet" + return pl.scan_parquet(path) def test_results(q_num: int, result_df: pl.DataFrame): with CodeTimer(name=f"Testing result of duckdb Query {q_num}", unit="s"): answer = get_query_answer(q_num).collect() - pl_test.assert_frame_equal(left=result_df, right=answer, check_dtype=False) + assert_frame_equal(left=result_df, right=answer, check_dtype=False) def get_line_item_ds(base_dir: str = DATASET_BASE_DIR) -> str: diff --git a/queries/modin/q3.py b/queries/modin/q3.py index f5816e1..e1ddd7b 100644 --- a/queries/modin/q3.py +++ b/queries/modin/q3.py @@ -1,4 +1,4 @@ -import datetime +from datetime import datetime from queries.modin import utils @@ -6,7 +6,7 @@ def q(): - var1 = var2 = datetime.datetime.strptime("1995-03-15", "%Y-%m-%d") + var1 = var2 = datetime(1995, 3, 15) var3 = "BUILDING" customer_ds = utils.get_customer_ds diff --git a/queries/modin/q4.py b/queries/modin/q4.py index d2705b0..947126a 100644 --- a/queries/modin/q4.py +++ b/queries/modin/q4.py @@ -1,4 +1,4 @@ -import datetime +from datetime import datetime from queries.modin import utils @@ -6,8 +6,8 @@ def q(): - date1 = datetime.datetime.strptime("1993-10-01", "%Y-%m-%d") - date2 = datetime.datetime.strptime("1993-07-01", "%Y-%m-%d") + date1 = datetime(1993, 10, 1) + date2 = datetime(1993, 7, 1) line_item_ds = utils.get_line_item_ds orders_ds = utils.get_orders_ds diff --git a/queries/modin/q5.py b/queries/modin/q5.py index 7dcab47..2e37f2c 100644 --- a/queries/modin/q5.py +++ b/queries/modin/q5.py @@ -1,4 +1,4 @@ -import datetime +from datetime import datetime from queries.modin import utils @@ -6,8 +6,8 @@ def q(): - date1 = datetime.datetime.strptime("1994-01-01", "%Y-%m-%d") - date2 = datetime.datetime.strptime("1995-01-01", "%Y-%m-%d") + date1 = datetime(1994, 1, 1) + date2 = datetime(1995, 1, 1) region_ds = utils.get_region_ds nation_ds = utils.get_nation_ds diff --git a/queries/modin/q6.py b/queries/modin/q6.py index f53300d..fa5da7d 100644 --- a/queries/modin/q6.py +++ b/queries/modin/q6.py @@ -1,4 +1,4 @@ -import datetime +from datetime import datetime import modin.pandas as pd @@ -8,8 +8,8 @@ def q(): - date1 = datetime.datetime.strptime("1994-01-01", "%Y-%m-%d") - date2 = datetime.datetime.strptime("1995-01-01", "%Y-%m-%d") + date1 = datetime(1994, 1, 1) + date2 = datetime(1995, 1, 1) var3 = 24 line_item_ds = utils.get_line_item_ds diff --git a/queries/modin/q7.py b/queries/modin/q7.py index f68c47c..97a3426 100644 --- a/queries/modin/q7.py +++ b/queries/modin/q7.py @@ -1,4 +1,4 @@ -import datetime +from datetime import datetime import modin.pandas as pd @@ -8,6 +8,9 @@ def q(): + var1 = datetime(1995, 1, 1) + var2 = datetime(1997, 1, 1) + nation_ds = utils.get_nation_ds customer_ds = utils.get_customer_ds line_item_ds = utils.get_line_item_ds @@ -35,8 +38,7 @@ def query(): supplier_ds = supplier_ds() lineitem_filtered = line_item_ds[ - (line_item_ds["l_shipdate"] >= datetime.strptime("1995-01-01", "%Y-%m-%d")) - & (line_item_ds["l_shipdate"] < datetime.strptime("1997-01-01", "%Y-%m-%d")) + (line_item_ds["l_shipdate"] >= var1) & (line_item_ds["l_shipdate"] < var2) ] lineitem_filtered["l_year"] = lineitem_filtered["l_shipdate"].dt.year lineitem_filtered["revenue"] = lineitem_filtered["l_extendedprice"] * ( diff --git a/queries/pyspark/utils.py b/queries/pyspark/utils.py index 9a04946..d3d7eac 100644 --- a/queries/pyspark/utils.py +++ b/queries/pyspark/utils.py @@ -7,7 +7,7 @@ from pyspark.sql import SparkSession from queries.common_utils import ( - ANSWERS_BASE_DIR, + ANSWERS_PARQUET_BASE_DIR, DATASET_BASE_DIR, LOG_TIMINGS, SHOW_RESULTS, @@ -34,15 +34,11 @@ def __read_parquet_ds(path: Path, table_name: str) -> SparkDF: return df -def get_query_answer(query: int, base_dir: Path = ANSWERS_BASE_DIR) -> PandasDF: +def get_query_answer(query: int, base_dir: str = ANSWERS_PARQUET_BASE_DIR) -> PandasDF: import pandas as pd - answer_df = pd.read_csv( - base_dir / f"q{query}.out", - sep="|", - parse_dates=True, - ) - return answer_df.rename(columns=lambda x: x.strip()) + path = base_dir / f"q{query}.parquet" + return pd.read_parquet(path) def test_results(q_num: int, result_df: PandasDF): diff --git a/requirements.txt b/requirements.txt index 71e2d73..f7cf942 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,4 @@ pyspark # TODO: Enable optional ray dependency for modin when ray supports Python 3.12 # https://github.com/ray-project/ray/issues/40211 # modin[ray] -modin +modin[dask] From 6e1062fa160c7fa1ad54a885c28424dab09adf61 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Thu, 22 Feb 2024 13:40:51 +0100 Subject: [PATCH 5/6] Rename answers --- queries/common_utils.py | 3 +-- queries/dask/utils.py | 6 ++---- queries/duckdb/utils.py | 6 ++---- queries/modin/utils.py | 11 +++-------- queries/pandas/utils.py | 4 ++-- queries/polars/utils.py | 6 ++---- queries/pyspark/utils.py | 4 ++-- 7 files changed, 14 insertions(+), 26 deletions(-) diff --git a/queries/common_utils.py b/queries/common_utils.py index 756ced6..2693029 100644 --- a/queries/common_utils.py +++ b/queries/common_utils.py @@ -23,8 +23,7 @@ CWD = Path(__file__).parent ROOT = CWD.parent DATASET_BASE_DIR = ROOT / "data" / "tables" / f"scale-{SCALE_FACTOR}" -ANSWERS_BASE_DIR = ROOT / "tpch-dbgen/answers" -ANSWERS_PARQUET_BASE_DIR = ROOT / "data" / "answers" +ANSWERS_BASE_DIR = ROOT / "data" / "answers" TIMINGS_FILE = ROOT / os.environ.get("TIMINGS_FILE", "timings.csv") DEFAULT_PLOTS_DIR = ROOT / "plots" diff --git a/queries/dask/utils.py b/queries/dask/utils.py index 94e29fa..910882f 100644 --- a/queries/dask/utils.py +++ b/queries/dask/utils.py @@ -10,7 +10,7 @@ from pandas.testing import assert_series_equal from queries.common_utils import ( - ANSWERS_PARQUET_BASE_DIR, + ANSWERS_BASE_DIR, DATASET_BASE_DIR, FILE_TYPE, INCLUDE_IO, @@ -31,9 +31,7 @@ def read_ds(path: str) -> Union: return dd.from_pandas(pd.read_parquet(path), npartitions=os.cpu_count()) -def get_query_answer( - query: int, base_dir: str = ANSWERS_PARQUET_BASE_DIR -) -> pd.DataFrame: +def get_query_answer(query: int, base_dir: str = ANSWERS_BASE_DIR) -> pd.DataFrame: path = base_dir / f"q{query}.parquet" return pd.read_parquet(path) diff --git a/queries/duckdb/utils.py b/queries/duckdb/utils.py index 9047f74..0469d74 100644 --- a/queries/duckdb/utils.py +++ b/queries/duckdb/utils.py @@ -9,7 +9,7 @@ from polars.testing import assert_frame_equal from queries.common_utils import ( - ANSWERS_PARQUET_BASE_DIR, + ANSWERS_BASE_DIR, DATASET_BASE_DIR, FILE_TYPE, INCLUDE_IO, @@ -40,9 +40,7 @@ def _scan_ds(path: Path): return path -def get_query_answer( - query: int, base_dir: Path = ANSWERS_PARQUET_BASE_DIR -) -> pl.LazyFrame: +def get_query_answer(query: int, base_dir: Path = ANSWERS_BASE_DIR) -> pl.LazyFrame: path = base_dir / f"q{query}.parquet" return pl.scan_parquet(path) diff --git a/queries/modin/utils.py b/queries/modin/utils.py index 4746e0a..1b56ca9 100644 --- a/queries/modin/utils.py +++ b/queries/modin/utils.py @@ -21,16 +21,11 @@ def __read_parquet_ds(path: str) -> PandasDF: return pd.read_parquet(path, dtype_backend="pyarrow", engine="pyarrow") -def get_query_answer(query: int, base_dir: Path = ANSWERS_BASE_DIR) -> PandasDF: +def get_query_answer(query: int, base_dir: str = ANSWERS_BASE_DIR) -> PandasDF: import pandas as pd - answer_df = pd.read_csv( - base_dir / f"q{query}.out", - sep="|", - parse_dates=True, - infer_datetime_format=True, - ) - return answer_df.rename(columns=lambda x: x.strip()) + path = base_dir / f"q{query}.parquet" + return pd.read_parquet(path) def test_results(q_num: int, result_df: PandasDF): diff --git a/queries/pandas/utils.py b/queries/pandas/utils.py index 0fead0a..13efe19 100644 --- a/queries/pandas/utils.py +++ b/queries/pandas/utils.py @@ -9,7 +9,7 @@ from pandas.testing import assert_series_equal from queries.common_utils import ( - ANSWERS_PARQUET_BASE_DIR, + ANSWERS_BASE_DIR, DATASET_BASE_DIR, FILE_TYPE, LOG_TIMINGS, @@ -32,7 +32,7 @@ def _read_ds(path: Path) -> PandasDF: raise ValueError(msg) -def get_query_answer(query: int, base_dir: str = ANSWERS_PARQUET_BASE_DIR) -> PandasDF: +def get_query_answer(query: int, base_dir: str = ANSWERS_BASE_DIR) -> PandasDF: path = base_dir / f"q{query}.parquet" return pd.read_parquet(path, dtype_backend="pyarrow") diff --git a/queries/polars/utils.py b/queries/polars/utils.py index 9bae1c7..15c3f71 100644 --- a/queries/polars/utils.py +++ b/queries/polars/utils.py @@ -7,7 +7,7 @@ from polars.testing import assert_frame_equal from queries.common_utils import ( - ANSWERS_PARQUET_BASE_DIR, + ANSWERS_BASE_DIR, DATASET_BASE_DIR, FILE_TYPE, INCLUDE_IO, @@ -34,9 +34,7 @@ def _scan_ds(path: Path): return scan.collect().rechunk().lazy() -def get_query_answer( - query: int, base_dir: Path = ANSWERS_PARQUET_BASE_DIR -) -> pl.LazyFrame: +def get_query_answer(query: int, base_dir: Path = ANSWERS_BASE_DIR) -> pl.LazyFrame: return pl.scan_parquet(base_dir / f"q{query}.parquet") diff --git a/queries/pyspark/utils.py b/queries/pyspark/utils.py index d3d7eac..78e7b5c 100644 --- a/queries/pyspark/utils.py +++ b/queries/pyspark/utils.py @@ -7,7 +7,7 @@ from pyspark.sql import SparkSession from queries.common_utils import ( - ANSWERS_PARQUET_BASE_DIR, + ANSWERS_BASE_DIR, DATASET_BASE_DIR, LOG_TIMINGS, SHOW_RESULTS, @@ -34,7 +34,7 @@ def __read_parquet_ds(path: Path, table_name: str) -> SparkDF: return df -def get_query_answer(query: int, base_dir: str = ANSWERS_PARQUET_BASE_DIR) -> PandasDF: +def get_query_answer(query: int, base_dir: str = ANSWERS_BASE_DIR) -> PandasDF: import pandas as pd path = base_dir / f"q{query}.parquet" From 4c7576a4bc9f086a906d6c84ed3eb736ab9e06d7 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Thu, 22 Feb 2024 13:59:36 +0100 Subject: [PATCH 6/6] Formatting --- queries/dask/q3.py | 4 ++-- queries/dask/q4.py | 4 ++-- queries/dask/q5.py | 4 ++-- queries/dask/q6.py | 6 +++--- queries/pandas/q5.py | 6 +++--- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/queries/dask/q3.py b/queries/dask/q3.py index f780fd7..edc9567 100644 --- a/queries/dask/q3.py +++ b/queries/dask/q3.py @@ -1,4 +1,4 @@ -import datetime +from datetime import datetime from queries.dask import utils @@ -6,7 +6,7 @@ def q(): - var1 = datetime.datetime.strptime("1995-03-15", "%Y-%m-%d") + var1 = datetime(1995, 3, 15) var2 = "BUILDING" line_item_ds = utils.get_line_item_ds diff --git a/queries/dask/q4.py b/queries/dask/q4.py index 9307e5d..3a4dd91 100644 --- a/queries/dask/q4.py +++ b/queries/dask/q4.py @@ -6,8 +6,8 @@ def q(): - date1 = datetime.strptime("1993-10-01", "%Y-%m-%d") - date2 = datetime.strptime("1993-07-01", "%Y-%m-%d") + date1 = datetime(1993, 10, 1) + date2 = datetime(1993, 7, 1) line_item_ds = utils.get_line_item_ds orders_ds = utils.get_orders_ds diff --git a/queries/dask/q5.py b/queries/dask/q5.py index c2349e8..167e4f0 100644 --- a/queries/dask/q5.py +++ b/queries/dask/q5.py @@ -6,8 +6,8 @@ def q(): - date1 = datetime.datetime.strptime("1994-01-01", "%Y-%m-%d") - date2 = datetime.datetime.strptime("1995-01-01", "%Y-%m-%d") + date1 = datetime(1994, 1, 1) + date2 = datetime(1995, 1, 1) region_ds = utils.get_region_ds nation_ds = utils.get_nation_ds diff --git a/queries/dask/q6.py b/queries/dask/q6.py index b7c96d6..00388e4 100644 --- a/queries/dask/q6.py +++ b/queries/dask/q6.py @@ -1,4 +1,4 @@ -import datetime +from datetime import datetime import pandas as pd @@ -8,8 +8,8 @@ def q(): - date1 = datetime.datetime.strptime("1994-01-01", "%Y-%m-%d") - date2 = datetime.datetime.strptime("1995-01-01", "%Y-%m-%d") + date1 = datetime(1994, 1, 1) + date2 = datetime(1995, 1, 1) var3 = 24 line_item_ds = utils.get_line_item_ds diff --git a/queries/pandas/q5.py b/queries/pandas/q5.py index 6c0084f..62574d5 100644 --- a/queries/pandas/q5.py +++ b/queries/pandas/q5.py @@ -1,4 +1,4 @@ -import datetime +from datetime import datetime from queries.pandas import utils @@ -6,8 +6,8 @@ def q(): - date1 = datetime.datetime.strptime("1994-01-01", "%Y-%m-%d") - date2 = datetime.datetime.strptime("1995-01-01", "%Y-%m-%d") + date1 = datetime(1994, 1, 1) + date2 = datetime(1995, 1, 1) region_ds = utils.get_region_ds nation_ds = utils.get_nation_ds