Skip to content

Commit

Permalink
unified data sizes for all benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
Hoppe committed Oct 7, 2024
1 parent 5fc46fa commit 3fd76cb
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 36 deletions.
5 changes: 2 additions & 3 deletions benchmarks/cb/linalg.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import heat as ht
from mpi4py import MPI
from perun import monitor
from sizes import GSIZE_TS_L, GSIZE_TS_S, GSIZE_SQ
from sizes import GSIZE_TS_L, GSIZE_TS_S, GSIZE_SQ, LANCZOS_SIZE

"""
Benchmarks in this file:
Expand Down Expand Up @@ -99,8 +99,7 @@ def run_linalg_benchmarks():
qr_split_1(a_1)
del a_1

n = 1000
A = ht.random.random((n, n), dtype=ht.float64, split=0)
A = ht.random.random((LANCZOS_SIZE, LANCZOS_SIZE), dtype=ht.float64, split=0)
B = A @ A.T
lanczos(B)
del A, B
Expand Down
79 changes: 51 additions & 28 deletions benchmarks/cb/manipulations.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,49 +2,72 @@
import heat as ht
from typing import List
from perun import monitor
from sizes import GSIZE_SQ, GSIZE_CB

"""
Benchmarks so far:
- concatenation along split axis
- reshaping along split axis with new_split
- resplitting (of a split array)
- unsplit a split array
"""


@monitor()
def concatenate(arrays):
    """Benchmark concatenating `arrays` along axis 1 (the inputs' split axis; see caller)."""
    result = ht.concatenate(arrays, axis=1)


@monitor()
def reshape(arrays):
for array in arrays:
a = ht.reshape(array, (10000000, -1), new_split=1)
def concatenate_nosplit(arrays):
    """Benchmark concatenating `arrays` along axis 1, which is not the inputs' split axis (caller passes split=0)."""
    result = ht.concatenate(arrays, axis=1)


@monitor()
def reshape(array):
    """Benchmark merging the two leading axes of `array` into one, asking for the result to be split along axis 1."""
    merged_dim = array.shape[0] * array.shape[1]
    result = ht.reshape(array, (merged_dim, -1), new_split=1)


@monitor()
def reshape_nosplit(array):
    """Benchmark merging the two leading axes of `array` into one, result split along axis 1.

    NOTE(review): the caller passes a split=2 cube here, so the trailing axis —
    and presumably the local data layout — is preserved; confirm that this is
    what "nosplit" is meant to measure.
    """
    merged_dim = array.shape[0] * array.shape[1]
    result = ht.reshape(array, (merged_dim, -1), new_split=1)


@monitor()
def resplit(array, new_split: List[int | None]):
    """Resplit `array` along each axis listed in `new_split`, freeing each result immediately.

    NOTE(review): the loop variable deliberately(?) shadows the `new_split`
    parameter — functional, but confusing; also `List[int | None]` is evaluated
    at definition time and requires Python >= 3.10.
    """
    for new_split in new_split:
        a = ht.resplit(array, axis=new_split)
        del a
def resplit(array):
    """Benchmark redistributing `array` to be split along axis 1."""
    result = ht.resplit(array, axis=1)


@monitor()
def unsplit(array):
    """Benchmark resplitting `array` to axis=None (removing the split)."""
    result = ht.resplit(array, axis=None)


def run_manipulation_benchmarks():
    """Run all manipulation benchmarks with the globally unified data sizes.

    Every benchmark gets a freshly allocated input that is freed right after
    the call, so the peak memory footprint of each benchmark is comparable.
    """
    # concatenation along the split axis (both inputs split=1, concatenated on axis 1)
    arrays = [
        ht.zeros((GSIZE_SQ // 2, GSIZE_SQ), split=1),
        ht.zeros((GSIZE_SQ // 2, GSIZE_SQ), split=1),
    ]
    concatenate(arrays)
    del arrays

    # concatenation along a non-split axis (inputs split=0, concatenated on axis 1)
    arrays = [
        ht.zeros((GSIZE_SQ // 2, GSIZE_SQ), split=0),
        ht.zeros((GSIZE_SQ // 2, GSIZE_SQ), split=0),
    ]
    concatenate_nosplit(arrays)
    del arrays

    # reshape of a cube split along the first axis
    array = ht.zeros((GSIZE_CB, GSIZE_CB, GSIZE_CB), split=0)
    # BUG FIX: was `reshape(arrays)` — `arrays` had already been deleted above,
    # which would raise NameError; the freshly created `array` is intended.
    reshape(array)
    del array

    # reshape of a cube split along the last axis
    array = ht.zeros((GSIZE_CB, GSIZE_CB, GSIZE_CB), split=2)
    # BUG FIX: was `reshape_nosplit(arrays)` — same stale-name problem as above.
    reshape_nosplit(array)
    del array

    # resplit a split array to another split axis
    array = ht.ones((GSIZE_SQ, GSIZE_SQ), split=0)
    resplit(array)
    del array

    # remove the split entirely (split=0 -> None)
    array = ht.ones((GSIZE_SQ, GSIZE_SQ), split=0)
    unsplit(array)
16 changes: 12 additions & 4 deletions benchmarks/cb/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,17 @@
import heat as ht
from mpi4py import MPI
from perun import monitor
from sizes import GSIZE_TS_L, GSIZE_TS_S

# we benchmark the in-place versions (`copy=False`) of the preprocessing functions
# for each function, both the forward and the inverse transformation are applied
"""
Benchmarks in this file:
- StandardScaler and inverse_transform
- MinMaxScaler and inverse_transform
- MaxAbsScaler and inverse_transform
- RobustScaler and inverse_transform
- Normalizer (without inverse, of course)
For each of them, fit_transform and inverse_transform are applied (together); the data is split along the sample axis.
"""


@monitor()
Expand Down Expand Up @@ -42,8 +50,8 @@ def apply_inplace_normalizer(X):


def run_preprocessing_benchmarks():
n_data_points = 5000
n_features = 50
n_data_points = GSIZE_TS_L
n_features = GSIZE_TS_S
X = ht.random.randn(n_data_points, n_features, split=0)

apply_inplace_standard_scaler_and_inverse(X)
Expand Down
16 changes: 15 additions & 1 deletion benchmarks/cb/sizes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,14 @@
TS_FACTOR_loc = 2
vTS_FACTOR_loc = 4

# all other variables are calculated based on the number of elements per process
"""
all other variables are calculated based on the number of elements per process
shape of a 2D square array: (GSIZE_SQ, GSIZE_SQ)
shape of a 3D cube array: (GSIZE_CB, GSIZE_CB, GSIZE_CB)
shape of a 2D tall-skinny array: (GSIZE_TS_L, GSIZE_TS_S)
shape of a 2D very tall-skinny array: (GSIZE_vTS_L, GSIZE_vTS_S)
similar for short-fat and very short-fat arrays...
"""
n_procs = ht.MPI_WORLD.size
N_ELEMENTS_TOTAL = N_ELEMENTS_PER_PROC * n_procs

Expand All @@ -26,3 +33,10 @@
(N_ELEMENTS_TOTAL / vTS_FACTOR_GLOB) ** 0.5
) # short dimension of very tall-skinny matrix
GSIZE_vTS_L = GSIZE_TS_S * vTS_FACTOR_GLOB + 1 # long dimension of very tall-skinny matrix

GSIZE_CB = int(N_ELEMENTS_TOTAL ** (1 / 3))  # edge length of a 3D cube array

"""
Exceptions needed for the moment:
"""
# fixed problem size for the Lanczos benchmark; deliberately NOT derived
# from N_ELEMENTS_PER_PROC (see "Exceptions" above)
LANCZOS_SIZE = 2**10

0 comments on commit 3fd76cb

Please sign in to comment.