Enable ruff linter #77

Merged 5 commits on Feb 21, 2024
8 changes: 4 additions & 4 deletions .github/workflows/lint.yml
@@ -19,10 +19,10 @@ jobs:
VERSION=$(grep -m 1 -oP 'ruff==\K(.*)' requirements.txt)
echo "version=$VERSION" >> $GITHUB_OUTPUT

# - uses: chartboost/ruff-action@v1
# with:
# version: ${{ steps.version.outputs.version }}
# args: check --no-fix
- uses: chartboost/ruff-action@v1
with:
version: ${{ steps.version.outputs.version }}
args: check --no-fix

- uses: chartboost/ruff-action@v1
with:
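Note on the workflow change above: the previously commented-out `ruff check` step is now active, pinned to the ruff version parsed out of requirements.txt. As a rough illustration, the grep in the earlier step corresponds to this Python extraction (file name and pin format taken from the diff; everything else is illustrative):

```python
import re
from pathlib import Path

# Equivalent of: grep -m 1 -oP 'ruff==\K(.*)' requirements.txt
# -- pull the version pinned as "ruff==X.Y.Z" so CI and local runs
# use the same ruff release.
match = re.search(r"^ruff==(.+)$", Path("requirements.txt").read_text(), re.MULTILINE)
if match is None:
    raise SystemExit("ruff is not pinned in requirements.txt")
print(match.group(1))
```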
1 change: 1 addition & 0 deletions .gitignore
@@ -16,3 +16,4 @@ perf.*
target/
**/flamegraph.svg
Cargo.lock
.ruff_cache/
16 changes: 9 additions & 7 deletions common_utils.py
@@ -1,6 +1,7 @@
import os
import re
import sys
from pathlib import Path
from subprocess import run

from linetimer import CodeTimer
@@ -17,16 +18,17 @@
print("log timings:", LOG_TIMINGS)
print("file type:", FILE_TYPE)

CWD = os.path.dirname(os.path.realpath(__file__))
DATASET_BASE_DIR = os.path.join(CWD, f"tables_scale_{SCALE_FACTOR}")
ANSWERS_BASE_DIR = os.path.join(CWD, "tpch-dbgen/answers")
ANSWERS_PARQUET_BASE_DIR = os.path.join(CWD, "data/answers")
TIMINGS_FILE = os.path.join(CWD, os.environ.get("TIMINGS_FILE", "timings.csv"))
DEFAULT_PLOTS_DIR = os.path.join(CWD, "plots")

CWD = Path(__file__).parent
DATASET_BASE_DIR = CWD / f"tables_scale_{SCALE_FACTOR}"
ANSWERS_BASE_DIR = CWD / "tpch-dbgen/answers"
ANSWERS_PARQUET_BASE_DIR = CWD / "data/answers"
TIMINGS_FILE = CWD / os.environ.get("TIMINGS_FILE", "timings.csv")
DEFAULT_PLOTS_DIR = CWD / "plots"


def append_row(solution: str, q: str, secs: float, version: str, success=True):
with open(TIMINGS_FILE, "a") as f:
with TIMINGS_FILE.open("a") as f:
if f.tell() == 0:
f.write("solution,version,query_no,duration[s],include_io,success\n")
f.write(f"{solution},{version},{q},{secs},{INCLUDE_IO},{success}\n")
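The common_utils.py hunk is the core of the pathlib migration: module-level paths switch from os.path.join to Path's `/` operator, and open() becomes Path.open(). One subtlety: `Path(__file__).parent` does not resolve symlinks, whereas the old `os.path.realpath` did; `Path(__file__).resolve().parent` would be the exact equivalent. A minimal standalone sketch of the new idioms (file names and the sample row are made up):

```python
from pathlib import Path

CWD = Path(__file__).parent          # old: os.path.dirname(os.path.realpath(__file__))
timings_file = CWD / "timings.csv"   # old: os.path.join(CWD, "timings.csv")

with timings_file.open("a") as f:    # old: open(timings_file, "a")
    if f.tell() == 0:                # empty file -> write the CSV header once
        f.write("solution,version,query_no,duration[s],include_io,success\n")
    f.write("polars,0.20.0,q1,1.23,False,True\n")  # illustrative row
```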
2 changes: 1 addition & 1 deletion dask_queries/q1.py
@@ -1,4 +1,4 @@
from datetime import date, datetime
from datetime import datetime

from dask_queries import utils

5 changes: 4 additions & 1 deletion dask_queries/q4.py
@@ -27,7 +27,10 @@ def query():
flineitem = line_item_ds[lsel]
forders = orders_ds[osel]
forders = forders[["o_orderkey", "o_orderpriority"]]
# jn = forders[forders["o_orderkey"].compute().isin(flineitem["l_orderkey"])] # doesn't support isin

# doesn't support isin
# jn = forders[forders["o_orderkey"].compute().isin(flineitem["l_orderkey"])]

jn = forders.merge(
flineitem, left_on="o_orderkey", right_on="l_orderkey"
).drop_duplicates(subset=["o_orderkey"])[["o_orderpriority", "o_orderkey"]]
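The q4 change only reflows a too-long comment; the live code already replaces the unsupported isin with a merge plus drop_duplicates, which amounts to a semi-join. Sketched here with plain pandas for brevity (frame contents invented):

```python
import pandas as pd

orders = pd.DataFrame(
    {"o_orderkey": [1, 2, 3], "o_orderpriority": ["1-URGENT", "2-HIGH", "3-MEDIUM"]}
)
lineitem = pd.DataFrame({"l_orderkey": [1, 1, 3]})

# Semi-join: keep each order that matched at least one lineitem row,
# without multiplying orders by their match count.
jn = orders.merge(lineitem, left_on="o_orderkey", right_on="l_orderkey").drop_duplicates(
    subset=["o_orderkey"]
)[["o_orderpriority", "o_orderkey"]]
print(jn)  # orders 1 and 3 survive; order 2 had no lineitem
```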
1 change: 0 additions & 1 deletion dask_queries/q7.py
@@ -1,5 +1,4 @@
import datetime
from datetime import datetime

import dask.dataframe as dd

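The q7 cleanups (here and in modin_queries/q7.py below) remove one of two clashing datetime imports: once `from datetime import datetime` runs, the name `datetime` is the class, not the module, so the earlier `import datetime` is dead at best and misleading at worst. A standalone demonstration:

```python
import datetime
from datetime import datetime  # rebinds "datetime" to the class

print(datetime(2024, 2, 21))  # fine: this is the datetime.datetime constructor

# The module's other attributes are no longer reachable through this name:
# datetime.date is now the instance method datetime.date(), so
# datetime.date(2024, 2, 21) raises a TypeError instead of building a date.
```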
28 changes: 15 additions & 13 deletions dask_queries/utils.py
@@ -1,7 +1,8 @@
import os
import timeit
from os.path import join
from typing import Callable, Union
from collections.abc import Callable
from pathlib import Path
from typing import Union

import dask.dataframe as dd
import pandas as pd
@@ -23,14 +24,15 @@ def read_ds(path: str) -> Union:
if INCLUDE_IO:
return dd.read_parquet(path)
if FILE_TYPE == "feather":
raise ValueError("file type feather not supported for dask queries")
msg = "file type feather not supported for dask queries"
raise ValueError(msg)

return dd.from_pandas(pd.read_parquet(path), npartitions=os.cpu_count())


def get_query_answer(query: int, base_dir: str = ANSWERS_BASE_DIR) -> dd.DataFrame:
def get_query_answer(query: int, base_dir: Path = ANSWERS_BASE_DIR) -> dd.DataFrame:
answer_df = pd.read_csv(
join(base_dir, f"q{query}.out"),
base_dir / f"q{query}.out",
sep="|",
parse_dates=True,
infer_datetime_format=True,
@@ -55,42 +57,42 @@ def test_results(q_num: int, result_df: pd.DataFrame):

@on_second_call
def get_line_item_ds(base_dir: str = DATASET_BASE_DIR) -> dd.DataFrame:
return read_ds(join(base_dir, "lineitem.parquet"))
return read_ds(Path(base_dir) / "lineitem.parquet")


@on_second_call
def get_orders_ds(base_dir: str = DATASET_BASE_DIR) -> dd.DataFrame:
return read_ds(join(base_dir, "orders.parquet"))
return read_ds(Path(base_dir) / "orders.parquet")


@on_second_call
def get_customer_ds(base_dir: str = DATASET_BASE_DIR) -> dd.DataFrame:
return read_ds(join(base_dir, "customer.parquet"))
return read_ds(Path(base_dir) / "customer.parquet")


@on_second_call
def get_region_ds(base_dir: str = DATASET_BASE_DIR) -> dd.DataFrame:
return read_ds(join(base_dir, "region.parquet"))
return read_ds(Path(base_dir) / "region.parquet")


@on_second_call
def get_nation_ds(base_dir: str = DATASET_BASE_DIR) -> dd.DataFrame:
return read_ds(join(base_dir, "nation.parquet"))
return read_ds(Path(base_dir) / "nation.parquet")


@on_second_call
def get_supplier_ds(base_dir: str = DATASET_BASE_DIR) -> dd.DataFrame:
return read_ds(join(base_dir, "supplier.parquet"))
return read_ds(Path(base_dir) / "supplier.parquet")


@on_second_call
def get_part_ds(base_dir: str = DATASET_BASE_DIR) -> dd.DataFrame:
return read_ds(join(base_dir, "part.parquet"))
return read_ds(Path(base_dir) / "part.parquet")


@on_second_call
def get_part_supp_ds(base_dir: str = DATASET_BASE_DIR) -> dd.DataFrame:
return read_ds(join(base_dir, "partsupp.parquet"))
return read_ds(Path(base_dir) / "partsupp.parquet")


def run_query(q_num: str, query: Callable):
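Several rewrites in dask_queries/utils.py look like standard ruff autofixes (the rule codes are my reading of the diff, not stated in the PR): Callable moves from typing to collections.abc (UP035), and exception messages are bound to a name before the raise (EM101) so the traceback line doesn't duplicate a long string literal. In miniature:

```python
from collections.abc import Callable  # preferred over typing.Callable


def read_ds(file_type: str) -> str:
    if file_type == "feather":
        msg = "file type feather not supported for dask queries"
        raise ValueError(msg)  # traceback shows "raise ValueError(msg)", not the literal
    return f"reading {file_type}"


def run(fn: Callable[[str], str]) -> str:
    return fn("parquet")


print(run(read_ds))
```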
1 change: 0 additions & 1 deletion duckdb_queries/q15.py
@@ -1,5 +1,4 @@
import duckdb
from duckdb import DuckDBPyConnection

from duckdb_queries import utils

2 changes: 1 addition & 1 deletion duckdb_queries/q2.py
@@ -27,7 +27,7 @@ def q():
{supplier_ds},
{part_supp_ds},
{nation_ds},
{region_ds}
{region_ds}
where
p_partkey = ps_partkey
and s_suppkey = ps_suppkey
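The q2 hunk looks like a no-op because the removed and added `{region_ds}` lines differ only in whitespace; most likely trailing or leading spaces were stripped (a whitespace rule such as ruff's W291 would explain it, though the PR doesn't say).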
23 changes: 11 additions & 12 deletions duckdb_queries/utils.py
@@ -1,7 +1,6 @@
import timeit
from importlib.metadata import version
from os.path import join
from typing import Any
from pathlib import Path

import duckdb
import polars as pl
@@ -20,7 +19,7 @@
)


def _scan_ds(path: str):
def _scan_ds(path: Path):
path = f"{path}.{FILE_TYPE}"
if FILE_TYPE == "parquet":
if INCLUDE_IO:
@@ -44,7 +43,7 @@ def _scan_ds(path: str):
def get_query_answer(
query: int, base_dir: str = ANSWERS_PARQUET_BASE_DIR
) -> pl.LazyFrame:
return pl.scan_parquet(join(base_dir, f"q{query}.parquet"))
return pl.scan_parquet(Path(base_dir) / f"q{query}.parquet")


def test_results(q_num: int, result_df: pl.DataFrame):
@@ -54,35 +53,35 @@ def test_results(q_num: int, result_df: pl.DataFrame):


def get_line_item_ds(base_dir: str = DATASET_BASE_DIR) -> str:
return _scan_ds(join(base_dir, "lineitem"))
return _scan_ds(Path(base_dir) / "lineitem")


def get_orders_ds(base_dir: str = DATASET_BASE_DIR) -> str:
return _scan_ds(join(base_dir, "orders"))
return _scan_ds(Path(base_dir) / "orders")


def get_customer_ds(base_dir: str = DATASET_BASE_DIR) -> str:
return _scan_ds(join(base_dir, "customer"))
return _scan_ds(Path(base_dir) / "customer")


def get_region_ds(base_dir: str = DATASET_BASE_DIR) -> str:
return _scan_ds(join(base_dir, "region"))
return _scan_ds(Path(base_dir) / "region")


def get_nation_ds(base_dir: str = DATASET_BASE_DIR) -> str:
return _scan_ds(join(base_dir, "nation"))
return _scan_ds(Path(base_dir) / "nation")


def get_supplier_ds(base_dir: str = DATASET_BASE_DIR) -> str:
return _scan_ds(join(base_dir, "supplier"))
return _scan_ds(Path(base_dir) / "supplier")


def get_part_ds(base_dir: str = DATASET_BASE_DIR) -> str:
return _scan_ds(join(base_dir, "part"))
return _scan_ds(Path(base_dir) / "part")


def get_part_supp_ds(base_dir: str = DATASET_BASE_DIR) -> str:
return _scan_ds(join(base_dir, "partsupp"))
return _scan_ds(Path(base_dir) / "partsupp")


def run_query(q_num: int, context: DuckDBPyRelation):
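In duckdb_queries/utils.py the getters now hand _scan_ds a Path, and _scan_ds appends the file extension with an f-string; interpolating a Path stringifies it, so downstream code still receives a plain str. A small sketch of that, plus the pathlib-native alternative (the helper name here is illustrative):

```python
from pathlib import Path

FILE_TYPE = "parquet"  # mirrors the module-level constant; value assumed


def _table_path(base_dir: Path, name: str) -> str:
    # f-string interpolation calls str() on the Path, yielding e.g.
    # "tables_scale_1/lineitem.parquet" as a plain string.
    return f"{base_dir / name}.{FILE_TYPE}"


print(_table_path(Path("tables_scale_1"), "lineitem"))
# Equivalent here, since the table names carry no suffix of their own:
# str((Path("tables_scale_1") / "lineitem").with_suffix(f".{FILE_TYPE}"))
```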
1 change: 0 additions & 1 deletion modin_queries/q7.py
@@ -1,5 +1,4 @@
import datetime
from datetime import datetime

import modin.pandas as pd

25 changes: 12 additions & 13 deletions modin_queries/utils.py
@@ -1,10 +1,9 @@
import timeit
from os.path import join
from typing import Callable
from collections.abc import Callable
from pathlib import Path

import modin
import modin.pandas as pd
import pandas
from linetimer import CodeTimer, linetimer
from pandas.core.frame import DataFrame as PandasDF

@@ -22,11 +21,11 @@ def __read_parquet_ds(path: str) -> PandasDF:
return pd.read_parquet(path, dtype_backend="pyarrow", engine="pyarrow")


def get_query_answer(query: int, base_dir: str = ANSWERS_BASE_DIR) -> PandasDF:
def get_query_answer(query: int, base_dir: Path = ANSWERS_BASE_DIR) -> PandasDF:
import pandas as pd

answer_df = pd.read_csv(
join(base_dir, f"q{query}.out"),
base_dir / f"q{query}.out",
sep="|",
parse_dates=True,
infer_datetime_format=True,
@@ -53,42 +52,42 @@ def test_results(q_num: int, result_df: PandasDF):

@on_second_call
def get_line_item_ds(base_dir: str = DATASET_BASE_DIR) -> PandasDF:
return __read_parquet_ds(join(base_dir, "lineitem.parquet"))
return __read_parquet_ds(Path(base_dir) / "lineitem.parquet")


@on_second_call
def get_orders_ds(base_dir: str = DATASET_BASE_DIR) -> PandasDF:
return __read_parquet_ds(join(base_dir, "orders.parquet"))
return __read_parquet_ds(Path(base_dir) / "orders.parquet")


@on_second_call
def get_customer_ds(base_dir: str = DATASET_BASE_DIR) -> PandasDF:
return __read_parquet_ds(join(base_dir, "customer.parquet"))
return __read_parquet_ds(Path(base_dir) / "customer.parquet")


@on_second_call
def get_region_ds(base_dir: str = DATASET_BASE_DIR) -> PandasDF:
return __read_parquet_ds(join(base_dir, "region.parquet"))
return __read_parquet_ds(Path(base_dir) / "region.parquet")


@on_second_call
def get_nation_ds(base_dir: str = DATASET_BASE_DIR) -> PandasDF:
return __read_parquet_ds(join(base_dir, "nation.parquet"))
return __read_parquet_ds(Path(base_dir) / "nation.parquet")


@on_second_call
def get_supplier_ds(base_dir: str = DATASET_BASE_DIR) -> PandasDF:
return __read_parquet_ds(join(base_dir, "supplier.parquet"))
return __read_parquet_ds(Path(base_dir) / "supplier.parquet")


@on_second_call
def get_part_ds(base_dir: str = DATASET_BASE_DIR) -> PandasDF:
return __read_parquet_ds(join(base_dir, "part.parquet"))
return __read_parquet_ds(Path(base_dir) / "part.parquet")


@on_second_call
def get_part_supp_ds(base_dir: str = DATASET_BASE_DIR) -> PandasDF:
return __read_parquet_ds(join(base_dir, "partsupp.parquet"))
return __read_parquet_ds(Path(base_dir) / "partsupp.parquet")


def run_query(q_num: int, query: Callable):
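modin_queries/utils.py mirrors the same pathlib and Callable changes. Its getters are wrapped in @on_second_call, whose implementation isn't part of this diff; a plausible sketch of such a decorator is a memoizing wrapper, so each table is read at most once across repeated query runs (illustrative only, not the repo's code):

```python
import functools
from collections.abc import Callable


def on_second_call(func: Callable) -> Callable:
    cache: dict = {}

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Cache per argument set: the first call pays the read cost,
        # later calls reuse the already-loaded frame.
        key = (args, tuple(sorted(kwargs.items())))
        if key not in cache:
            cache[key] = func(*args, **kwargs)
        return cache[key]

    return wrapper
```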
2 changes: 0 additions & 2 deletions pandas_queries/q8.py
@@ -1,5 +1,3 @@
from datetime import datetime

import pandas as pd

from pandas_queries import utils