-
Notifications
You must be signed in to change notification settings - Fork 41
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d4532d7
commit bab1673
Showing
4 changed files
with
189 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# M5 Forecasting Feature Engineering | ||
|
||
The [M5 Forecasting Competition](https://www.sciencedirect.com/science/article/pii/S0169207021001874) was held on Kaggle in 2020, | ||
and top solutions generally featured a lot of heavy feature engineering. | ||
|
||
Participants typically used pandas (Polars was only just getting started at the time), so here we benchmark how long it would have
taken to do the same feature engineering with Polars (and, coming soon, DuckDB).
|
||
We believe this to be a useful task to benchmark, because: | ||
|
||
- the competition was run on real-world Walmart data | ||
- the operations we're benchmarking are from the winning solution, so evidently they were doing something right | ||
|
||
The original code can be found here: https://github.com/Mcompetitions/M5-methods. We run part of the preprocessing
functions from the top solution "A1". The code is generally kept as-is, with some minor modifications to deal | ||
with pandas syntax updates which happened in the last 2 years. | ||
|
||
## Running the benchmark | ||
|
||
**Data**: download the output files from https://www.kaggle.com/code/marcogorelli/winning-solution-preprocessing. | ||
Place them in a `data` folder here. | ||
|
||
**Run**: | ||
|
||
- `python pandas_queries.py` | ||
- `python polars_queries.py` | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# coming soon ;) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import os
import time

import numpy as np
import pandas as pd
import pyarrow

# Record library versions so benchmark results can be reproduced/compared.
print("pandas version", pd.__version__)
print("numpy version", np.__version__)
print("pyarrow version", pyarrow.__version__)

# Opt in to upcoming pandas behavior (copy-on-write, pyarrow-backed strings)
# so the benchmark reflects modern pandas defaults.
pd.options.mode.copy_on_write = True
pd.options.future.infer_string = True

# Directory holding the pre-generated parquet input files.
PROCESSED_DATA_DIR = "data"

# Column being forecast, and the competition's 28-day forecast horizon.
TARGET = "sales"
SHIFT_DAY = 28

# Set this to True if you just want to test that everything runs
SMALL = True
if SMALL:
    PATH = os.path.join(PROCESSED_DATA_DIR, "grid_part_1_small.parquet")
else:
    PATH = os.path.join(PROCESSED_DATA_DIR, "grid_part_1.parquet")

# Lags 28..42: every lag starts at the forecast horizon (SHIFT_DAY).
LAG_DAYS = [col for col in range(SHIFT_DAY, SHIFT_DAY + 15)]
|
||
|
||
def q1_pandas(df):
    """Return *df* with one lagged-sales column per lag in ``LAG_DAYS``.

    Each new column ``{TARGET}_lag_{lag}`` holds ``TARGET`` shifted by
    ``lag`` rows within each ``id`` group.
    """
    lag_columns = {}
    for lag in LAG_DAYS:
        # Bind the current lag as a default arg so each lambda keeps its own value.
        lag_columns[f"{TARGET}_lag_{lag}"] = df.groupby(["id"], observed=True)[
            TARGET
        ].transform(lambda x, lag=lag: x.shift(lag))
    return df.assign(**lag_columns)
|
||
|
||
def q2_pandas(df):
    """Add per-id rolling mean/std of ``TARGET`` (shifted by ``SHIFT_DAY``).

    Mutates *df* in place, adding ``rolling_mean_{w}`` and ``rolling_std_{w}``
    columns for each window size, and returns it.
    """
    windows = [7, 14, 30, 60, 180]
    for window in windows:
        df[f"rolling_mean_{window}"] = df.groupby(["id"], observed=True)[
            TARGET
        ].transform(lambda x: x.shift(SHIFT_DAY).rolling(window).mean())
    for window in windows:
        df[f"rolling_std_{window}"] = df.groupby(["id"], observed=True)[
            TARGET
        ].transform(lambda x: x.shift(SHIFT_DAY).rolling(window).std())
    return df
|
||
|
||
def q3_pandas(df):
    """Add per-id rolling means of ``TARGET`` for each (shift, window) pair.

    Mutates *df* in place, adding ``rolling_mean_{shift}_{window}`` columns,
    and returns it.
    """
    for shift in [1, 7, 14]:
        for window in [7, 14, 30, 60]:
            column = f"rolling_mean_{shift}_{window}"
            df[column] = df.groupby(["id"], observed=True)[TARGET].transform(
                lambda x: x.shift(shift).rolling(window).mean()
            )
    return df
|
||
|
||
# Benchmark each query on a freshly-loaded frame so columns added by earlier
# queries don't affect later timings.
# Fix: the q3 run previously printed the label "q2 took".
for label, query in [("q1", q1_pandas), ("q2", q2_pandas), ("q3", q3_pandas)]:
    start_time = time.perf_counter()
    query(pd.read_parquet(PATH, engine="pyarrow"))
    print(f"{label} took: {time.perf_counter() - start_time}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import os
import time

import polars as pl

# Record the library version so benchmark results can be reproduced/compared.
print("polars version", pl.__version__)

# Directory holding the pre-generated parquet input files.
PROCESSED_DATA_DIR = "data"

# Column being forecast, and the competition's 28-day forecast horizon.
TARGET = "sales"
SHIFT_DAY = 28

# Set this to True if you just want to test that everything runs
SMALL = True
if SMALL:
    PATH = os.path.join(PROCESSED_DATA_DIR, "grid_part_1_small.parquet")
else:
    PATH = os.path.join(PROCESSED_DATA_DIR, "grid_part_1.parquet")

# Lags 28..42: every lag starts at the forecast horizon (SHIFT_DAY).
LAG_DAYS = [col for col in range(SHIFT_DAY, SHIFT_DAY + 15)]
|
||
|
||
def q1_polars(df):
    """Return *df* with one lagged-sales column per lag in ``LAG_DAYS``.

    Each new column ``{TARGET}_lag_{lag}`` is ``TARGET`` shifted by ``lag``
    rows within each ``id`` window.
    """
    lag_exprs = [
        pl.col(TARGET).shift(lag).over("id").alias(f"{TARGET}_lag_{lag}")
        for lag in LAG_DAYS
    ]
    return df.with_columns(lag_exprs)
|
||
|
||
def q2_polars(df):
    """Return *df* with per-id rolling mean/std of the shifted ``TARGET``.

    Adds ``rolling_mean_{w}`` then ``rolling_std_{w}`` columns for each
    window size, with ``TARGET`` first shifted by ``SHIFT_DAY``.
    """
    windows = [7, 14, 30, 60, 180]
    mean_exprs = [
        pl.col(TARGET)
        .shift(SHIFT_DAY)
        .rolling_mean(window_size=window)
        .over("id")
        .alias(f"rolling_mean_{window}")
        for window in windows
    ]
    std_exprs = [
        pl.col(TARGET)
        .shift(SHIFT_DAY)
        .rolling_std(window_size=window)
        .over("id")
        .alias(f"rolling_std_{window}")
        for window in windows
    ]
    return df.with_columns(mean_exprs + std_exprs)
|
||
|
||
def q3_polars(df):
    """Return *df* with per-id rolling means for each (shift, window) pair.

    Adds ``rolling_mean_{shift}_{window}`` columns computed from ``TARGET``
    shifted by ``shift`` then averaged over ``window`` rows.
    """
    exprs = []
    for shift in [1, 7, 14]:
        for window in [7, 14, 30, 60]:
            exprs.append(
                pl.col(TARGET)
                .shift(shift)
                .rolling_mean(window_size=window)
                .over("id")
                .alias(f"rolling_mean_{shift}_{window}")
            )
    return df.with_columns(exprs)
|
||
|
||
# Fix: the q3 runs previously printed the label "q2 took" in both the lazy
# and eager sections. The two benchmark loops are also expressed once each.
QUERIES = [("q1", q1_polars), ("q2", q2_polars), ("q3", q3_polars)]

print("*** polars lazy ***")

# Lazy: build the query on a scan, then collect (includes read + compute).
for label, query in QUERIES:
    start_time = time.perf_counter()
    query(pl.scan_parquet(PATH)).collect()
    print(f"{label} took: {time.perf_counter() - start_time}")

print("*** polars eager ***")

# Eager: read the whole file up front, then run the query on the DataFrame.
for label, query in QUERIES:
    start_time = time.perf_counter()
    query(pl.read_parquet(PATH))
    print(f"{label} took: {time.perf_counter() - start_time}")