pola-rs · stinodego · Apr 23, 2024 · Apr 21, 2024 · Apr 21, 2024 · Apr 21, 2024
@@ -48,33 +48,6 @@ jobs:
         working-directory: py-polars
         run: pip install -r requirements-dev.txt
 
-      - name: Load benchmark data from cache
-        id: cache-data
-        uses: actions/cache/restore@v4
-        with:
-          path: py-polars/tests/benchmark/data/G1_1e7_1e2_5_0.csv
-          key: benchmark-data
-
-      - name: Set up R
-        if: steps.cache-data.outputs.cache-hit != 'true'
-        uses: r-lib/actions/setup-r@v2
-        with:
-          r-version: '4.3.3'
-
-      - name: Generate data
-        if: steps.cache-data.outputs.cache-hit != 'true'
-        working-directory: py-polars/tests/benchmark/data
-        run: |
-          Rscript -e 'install.packages("data.table", repos="https://cloud.r-project.org")'
-          Rscript groupby-datagen.R 1e7 1e2 5 0
-
-      - name: Save benchmark data in cache
-        if: github.ref_name == 'main'
-        uses: actions/cache/save@v4
-        with:
-          path: py-polars/tests/benchmark/data/G1_1e7_1e2_5_0.csv
-          key: ${{ steps.cache-data.outputs.cache-primary-key }}
-
       - name: Set up Rust
         run: rustup show
 

@@ -113,14 +113,14 @@ jobs:
 
       - name: Run Python tests
         working-directory: py-polars
-        run: pytest --cov -n auto --dist loadgroup -m "not benchmark and not docs" --cov-report xml:main.xml
+        run: pytest --cov -n auto --dist loadgroup -m "not release and not benchmark and not docs" --cov-report xml:main.xml
         continue-on-error: true
 
       - name: Run Python tests - async reader
         working-directory: py-polars
         env:
           POLARS_FORCE_ASYNC: 1
-        run: pytest --cov -m "not benchmark and not docs" tests/unit/io/ --cov-report xml:async.xml
+        run: pytest --cov -m "not release and not benchmark and not docs" tests/unit/io/ --cov-report xml:async.xml
         continue-on-error: true
 
       - name: Report Rust coverage

@@ -89,13 +89,13 @@ jobs:
           # Currently skipped due to performance issues in coverage:
           # https://github.com/nedbat/coveragepy/issues/1665
           COV: ${{ !(matrix.os == 'ubuntu-latest' && matrix.python-version == '3.12') && '--cov' || '--no-cov' }}
-        run: pytest $COV -n auto --dist loadgroup -m "not benchmark and not docs"
+        run: pytest $COV -n auto --dist loadgroup -m "not release and not benchmark and not docs"
 
       - name: Run tests async reader tests
         if: github.ref_name != 'main' && matrix.os != 'windows-latest'
         env:
           POLARS_FORCE_ASYNC: 1
-        run: pytest -m "not benchmark and not docs" tests/unit/io/
+        run: pytest -m "not release and not benchmark and not docs" tests/unit/io/
 
       - name: Check import without optional dependencies
         if: github.ref_name != 'main' && matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest'

@@ -102,12 +102,10 @@ Polars uses [CodSpeed](https://codspeed.io/pola-rs/polars) for tracking the perf
 
 ### Generating data
 
-For many tests, a relatively large dataset must be generated first.
-We use an [R](https://www.r-project.org/) script to generate this data.
-The script was taken from the [H2O AI database benchmark](https://github.com/h2oai/db-benchmark), which is the foundation for many of the benchmark tests.
+For most tests, a relatively large dataset must be generated first.
+This is done as part of the `pytest` setup process.
 
-For the exact steps to generate the data, please refer to the [benchmark workflow](https://github.com/pola-rs/polars/blob/main/.github/workflows/benchmark.yml).
-It involves [installing R](https://cran.r-project.org/), installing the [data.table](https://cran.r-project.org/web/packages/data.table/) dependency, and executing a data generation script.
+The data generation logic was taken from the [H2O.ai database benchmark](https://github.com/h2oai/db-benchmark), which is the foundation for many of the benchmark tests.
 
 ### Running the benchmark tests
 

@@ -100,7 +100,7 @@ test-all: .venv build  ## Run all tests
 
 .PHONY: coverage
 coverage: .venv build  ## Run tests and report coverage
-	$(VENV_BIN)/pytest --cov -n auto --dist loadgroup -m "not benchmark"
+	$(VENV_BIN)/pytest --cov -n auto --dist loadgroup -m "not release and not benchmark"
 
 .PHONY: clean
 clean:  ## Clean up caches and build artifacts

@@ -206,12 +206,13 @@ addopts = [
   "--strict-markers",
   "--import-mode=importlib",
   # Default to running fast tests only. To run ALL tests, run: pytest -m ""
-  "-m not slow and not write_disk and not benchmark and not hypothesis and not docs",
+  "-m not slow and not write_disk and not release and not docs and not hypothesis and not benchmark",
 ]
 markers = [
   "slow: Tests with a longer than average runtime.",
   "write_disk: Tests that write to disk",
   "debug: Tests that should be run on a Polars debug build.",
+  "release: Tests that should be run on a Polars release build.",
   "docs: Documentation code snippets",
 ]
 filterwarnings = [

@@ -1,7 +1,8 @@
 """
 Benchmark tests.
 
-These tests are skipped by default as a large dataset must be generated first.
+These tests are skipped by default as a relatively large dataset must be generated
+first.
 
 See the documentation on how to run these tests:
 https://docs.pola.rs/development/contributing/test/#benchmark-tests

@@ -1,34 +1,9 @@
-from pathlib import Path
-
 import pytest
 
 import polars as pl
+from tests.benchmark.datagen_groupby import generate_group_by_data
 
 
-@pytest.fixture(scope="module")
-def data_path() -> Path:
-    return Path(__file__).parent / "data"
-
-
-@pytest.fixture(scope="module")
-def h2aoi_groupby_data_path(data_path: Path) -> Path:
-    return data_path / "G1_1e7_1e2_5_0.csv"
-
-
-@pytest.fixture(scope="module")
-def h2oai_groupby_data(h2aoi_groupby_data_path: Path) -> pl.DataFrame:
-    if not h2aoi_groupby_data_path.is_file():
-        pytest.skip("Dataset must be generated before running this test.")
-
-    df = pl.read_csv(
-        h2aoi_groupby_data_path,
-        dtypes={
-            "id4": pl.Int32,
-            "id5": pl.Int32,
-            "id6": pl.Int32,
-            "v1": pl.Int32,
-            "v2": pl.Int32,
-            "v3": pl.Float64,
-        },
-    )
-    return df
+@pytest.fixture(scope="session")  # session scope: created once per test session, then reused
+def groupby_data() -> pl.DataFrame:  # group-by benchmark input built by generate_group_by_data
+    return generate_group_by_data(10_000, 100, null_ratio=0.05)  # NOTE(review): presumably 10_000 rows, 100 groups - confirm