Skip to content

Commit

Permalink
Fix more typing
Browse files (browse the repository at this point in the history)
  • Loading branch information
stinodego committed Feb 22, 2024
1 parent 38a6eae commit ad89881
Show file tree
Hide file tree
Showing 30 changed files with 112 additions and 70 deletions.
8 changes: 7 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ docstring-code-format = true

[tool.mypy]
files = ["queries", "scripts"]
# strict = true
strict = true
enable_error_code = [
"redundant-expr",
"truthy-bool",
Expand All @@ -79,3 +79,9 @@ module = [
"plotly.*",
]
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = [
"plotnine.*",
]
follow_imports = "skip"
2 changes: 1 addition & 1 deletion queries/common_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def helper(*args: Any, **kwargs: Any) -> Any:
return helper


def execute_all(solution: str):
def execute_all(solution: str) -> None:
package_name = f"{solution}"

expr = re.compile(r"q(\d+).py$")
Expand Down
2 changes: 1 addition & 1 deletion queries/dask/q1.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def query() -> pd.DataFrame:
total.compute().reset_index().sort_values(["l_returnflag", "l_linestatus"])
)

return result_df
return result_df # type: ignore[no-any-return]

utils.run_query(Q_NUM, query)

Expand Down
2 changes: 1 addition & 1 deletion queries/dask/q2.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def query() -> pd.DataFrame:
],
)[:100]

return result_df
return result_df # type: ignore[no-any-return]

utils.run_query(Q_NUM, query)

Expand Down
2 changes: 1 addition & 1 deletion queries/dask/q3.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def query() -> pd.DataFrame:
:, ["l_orderkey", "revenue", "o_orderdate", "o_shippriority"]
]

return result_df
return result_df # type: ignore[no-any-return]

utils.run_query(Q_NUM, query)

Expand Down
2 changes: 1 addition & 1 deletion queries/dask/q4.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def query() -> pd.DataFrame:
.sort_values(["o_orderpriority"])
)
result_df = result_df.compute()
return result_df.rename({"o_orderkey": "order_count"}, axis=1)
return result_df.rename({"o_orderkey": "order_count"}, axis=1) # type: ignore[no-any-return]

utils.run_query(Q_NUM, query)

Expand Down
2 changes: 1 addition & 1 deletion queries/dask/q5.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def query() -> pd.DataFrame:
jn5["revenue"] = jn5.l_extendedprice * (1.0 - jn5.l_discount)
gb = jn5.groupby("n_name")["revenue"].sum()
result_df = gb.compute().reset_index().sort_values("revenue", ascending=False)
return result_df
return result_df # type: ignore[no-any-return]

utils.run_query(Q_NUM, query)

Expand Down
2 changes: 1 addition & 1 deletion queries/dask/q7.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def query() -> pd.DataFrame:
by=["supp_nation", "cust_nation", "l_year"],
ascending=[True, True, True],
)
return result_df
return result_df # type: ignore[no-any-return]

utils.run_query(Q_NUM, query)

Expand Down
32 changes: 17 additions & 15 deletions queries/dask/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os
import timeit
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any

import dask.dataframe as dd
import pandas as pd
Expand All @@ -24,23 +24,25 @@
from collections.abc import Callable
from pathlib import Path

from dask.dataframe.core import DataFrame

def read_ds(path: Path) -> dd.DataFrame:

def read_ds(path: Path) -> DataFrame:
if INCLUDE_IO:
return dd.read_parquet(path)
return dd.read_parquet(path) # type: ignore[attr-defined,no-any-return]
if FILE_TYPE == "feather":
msg = "file type feather not supported for dask queries"
raise ValueError(msg)

return dd.from_pandas(pd.read_parquet(path), npartitions=os.cpu_count())
return dd.from_pandas(pd.read_parquet(path), npartitions=os.cpu_count()) # type: ignore[attr-defined,no-any-return]


def get_query_answer(query: int, base_dir: Path = ANSWERS_BASE_DIR) -> pd.DataFrame:
path = base_dir / f"q{query}.parquet"
return pd.read_parquet(path)


def test_results(q_num: int, result_df: pd.DataFrame):
def test_results(q_num: int, result_df: pd.DataFrame) -> None:
with CodeTimer(name=f"Testing result of dask Query {q_num}", unit="s"):
answer = get_query_answer(q_num)

Expand All @@ -56,47 +58,47 @@ def test_results(q_num: int, result_df: pd.DataFrame):


@on_second_call
def get_line_item_ds(base_dir: Path = DATASET_BASE_DIR) -> dd.DataFrame:
def get_line_item_ds(base_dir: Path = DATASET_BASE_DIR) -> DataFrame:
return read_ds(base_dir / "lineitem.parquet")


@on_second_call
def get_orders_ds(base_dir: Path = DATASET_BASE_DIR) -> dd.DataFrame:
def get_orders_ds(base_dir: Path = DATASET_BASE_DIR) -> DataFrame:
return read_ds(base_dir / "orders.parquet")


@on_second_call
def get_customer_ds(base_dir: Path = DATASET_BASE_DIR) -> dd.DataFrame:
def get_customer_ds(base_dir: Path = DATASET_BASE_DIR) -> DataFrame:
return read_ds(base_dir / "customer.parquet")


@on_second_call
def get_region_ds(base_dir: Path = DATASET_BASE_DIR) -> dd.DataFrame:
def get_region_ds(base_dir: Path = DATASET_BASE_DIR) -> DataFrame:
return read_ds(base_dir / "region.parquet")


@on_second_call
def get_nation_ds(base_dir: Path = DATASET_BASE_DIR) -> dd.DataFrame:
def get_nation_ds(base_dir: Path = DATASET_BASE_DIR) -> DataFrame:
return read_ds(base_dir / "nation.parquet")


@on_second_call
def get_supplier_ds(base_dir: Path = DATASET_BASE_DIR) -> dd.DataFrame:
def get_supplier_ds(base_dir: Path = DATASET_BASE_DIR) -> DataFrame:
return read_ds(base_dir / "supplier.parquet")


@on_second_call
def get_part_ds(base_dir: Path = DATASET_BASE_DIR) -> dd.DataFrame:
def get_part_ds(base_dir: Path = DATASET_BASE_DIR) -> DataFrame:
return read_ds(base_dir / "part.parquet")


@on_second_call
def get_part_supp_ds(base_dir: Path = DATASET_BASE_DIR) -> dd.DataFrame:
def get_part_supp_ds(base_dir: Path = DATASET_BASE_DIR) -> DataFrame:
return read_ds(base_dir / "partsupp.parquet")


def run_query(q_num: int, query: Callable) -> None:
@linetimer(name=f"Overall execution of dask Query {q_num}", unit="s")
def run_query(q_num: int, query: Callable[..., Any]) -> None:
@linetimer(name=f"Overall execution of dask Query {q_num}", unit="s") # type: ignore[misc]
def run() -> None:
import dask

Expand Down
2 changes: 1 addition & 1 deletion queries/duckdb/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def get_part_supp_ds(base_dir: Path = DATASET_BASE_DIR) -> str:


def run_query(q_num: int, context: DuckDBPyRelation) -> None:
@linetimer(name=f"Overall execution of duckdb Query {q_num}", unit="s")
@linetimer(name=f"Overall execution of duckdb Query {q_num}", unit="s") # type: ignore[misc]
def query() -> None:
with CodeTimer(name=f"Get result of duckdb Query {q_num}", unit="s"):
t0 = timeit.default_timer()
Expand Down
10 changes: 8 additions & 2 deletions queries/modin/q1.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
from __future__ import annotations

from datetime import datetime
from typing import TYPE_CHECKING

from queries.modin import utils

if TYPE_CHECKING:
import pandas as pd

Q_NUM = 1


Expand All @@ -12,7 +18,7 @@ def q() -> None:
# first call one time to cache in case we don't include the IO times
lineitem()

def query():
def query() -> pd.DataFrame:
nonlocal lineitem
lineitem = lineitem()

Expand Down Expand Up @@ -62,7 +68,7 @@ def query():

result_df = total.reset_index().sort_values(["l_returnflag", "l_linestatus"])

return result_df
return result_df # type: ignore[no-any-return]

utils.run_query(Q_NUM, query)

Expand Down
11 changes: 9 additions & 2 deletions queries/modin/q2.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from queries.modin import utils

if TYPE_CHECKING:
import pandas as pd

Q_NUM = 2


Expand All @@ -21,7 +28,7 @@ def q() -> None:
part_ds()
part_supp_ds()

def query():
def query() -> pd.DataFrame:
nonlocal region_ds
nonlocal nation_ds
nonlocal supplier_ds
Expand Down Expand Up @@ -150,7 +157,7 @@ def query():
],
)[:100]

return result_df
return result_df # type: ignore[no-any-return]

utils.run_query(Q_NUM, query)

Expand Down
10 changes: 8 additions & 2 deletions queries/modin/q3.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
from __future__ import annotations

from datetime import datetime
from typing import TYPE_CHECKING

from queries.modin import utils

if TYPE_CHECKING:
import pandas as pd

Q_NUM = 3


Expand All @@ -18,7 +24,7 @@ def q() -> None:
line_item_ds()
orders_ds()

def query():
def query() -> pd.DataFrame:
nonlocal customer_ds
nonlocal line_item_ds
nonlocal orders_ds
Expand Down Expand Up @@ -53,7 +59,7 @@ def query():
result_df = total[:10].loc[
:, ["l_orderkey", "revenue", "o_orderdate", "o_shippriority"]
]
return result_df
return result_df # type: ignore[no-any-return]

utils.run_query(Q_NUM, query)

Expand Down
10 changes: 8 additions & 2 deletions queries/modin/q4.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
from __future__ import annotations

from datetime import datetime
from typing import TYPE_CHECKING

from queries.modin import utils

if TYPE_CHECKING:
import pandas as pd

Q_NUM = 4


Expand All @@ -16,7 +22,7 @@ def q() -> None:
line_item_ds()
orders_ds()

def query():
def query() -> pd.DataFrame:
nonlocal line_item_ds
nonlocal orders_ds
line_item_ds = line_item_ds()
Expand All @@ -33,7 +39,7 @@ def query():
.sort_values(["o_orderpriority"])
.rename(columns={"o_orderkey": "order_count"})
)
return result_df
return result_df # type: ignore[no-any-return]

utils.run_query(Q_NUM, query)

Expand Down
10 changes: 8 additions & 2 deletions queries/modin/q5.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
from __future__ import annotations

from datetime import datetime
from typing import TYPE_CHECKING

from queries.modin import utils

if TYPE_CHECKING:
import pandas as pd

Q_NUM = 5


Expand All @@ -24,7 +30,7 @@ def q() -> None:
orders_ds()
supplier_ds()

def query():
def query() -> pd.DataFrame:
nonlocal region_ds
nonlocal nation_ds
nonlocal customer_ds
Expand Down Expand Up @@ -55,7 +61,7 @@ def query():
jn5["revenue"] = jn5.l_extendedprice * (1.0 - jn5.l_discount)
gb = jn5.groupby("n_name", as_index=False)["revenue"].sum()
result_df = gb.sort_values("revenue", ascending=False)
return result_df
return result_df # type: ignore[no-any-return]

utils.run_query(Q_NUM, query)

Expand Down
4 changes: 3 additions & 1 deletion queries/modin/q6.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

from datetime import datetime

import modin.pandas as pd
Expand All @@ -17,7 +19,7 @@ def q() -> None:
# first call one time to cache in case we don't include the IO times
line_item_ds()

def query():
def query() -> pd.DataFrame:
nonlocal line_item_ds
line_item_ds = line_item_ds()

Expand Down
10 changes: 4 additions & 6 deletions queries/modin/q7.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

from datetime import datetime

import modin.pandas as pd
Expand All @@ -24,7 +26,7 @@ def q() -> None:
orders_ds()
supplier_ds()

def query():
def query() -> pd.DataFrame:
nonlocal nation_ds
nonlocal customer_ds
nonlocal line_item_ds
Expand Down Expand Up @@ -124,11 +126,7 @@ def query():

result_df = result_df.sort_values(
by=["supp_nation", "cust_nation", "l_year"],
ascending=[
True,
True,
True,
],
ascending=[True, True, True],
)
return result_df

Expand Down
Loading

0 comments on commit ad89881

Please sign in to comment.