From 3fd76cb7a1f020f19e00280a4b1345893304b15e Mon Sep 17 00:00:00 2001
From: Hoppe
Date: Mon, 7 Oct 2024 18:12:17 +0200
Subject: [PATCH] unified data sizes for all benchmarks

---
 benchmarks/cb/linalg.py        |  5 +--
 benchmarks/cb/manipulations.py | 79 ++++++++++++++++++++++------------
 benchmarks/cb/preprocessing.py | 16 +++++--
 benchmarks/cb/sizes.py         | 16 ++++++-
 4 files changed, 80 insertions(+), 36 deletions(-)

diff --git a/benchmarks/cb/linalg.py b/benchmarks/cb/linalg.py
index e9c08220f8..0cb393da63 100644
--- a/benchmarks/cb/linalg.py
+++ b/benchmarks/cb/linalg.py
@@ -2,7 +2,7 @@
 import heat as ht
 from mpi4py import MPI
 from perun import monitor
-from sizes import GSIZE_TS_L, GSIZE_TS_S, GSIZE_SQ
+from sizes import GSIZE_TS_L, GSIZE_TS_S, GSIZE_SQ, LANCZOS_SIZE
 
 """
 Benchmarks in this file:
@@ -99,8 +99,7 @@ def run_linalg_benchmarks():
     qr_split_1(a_1)
     del a_1
 
-    n = 1000
-    A = ht.random.random((n, n), dtype=ht.float64, split=0)
+    A = ht.random.random((LANCZOS_SIZE, LANCZOS_SIZE), dtype=ht.float64, split=0)
     B = A @ A.T
     lanczos(B)
     del A, B
diff --git a/benchmarks/cb/manipulations.py b/benchmarks/cb/manipulations.py
index 0fde87cff9..72adc9446a 100644
--- a/benchmarks/cb/manipulations.py
+++ b/benchmarks/cb/manipulations.py
@@ -2,49 +2,72 @@
 import heat as ht
 from typing import List
 from perun import monitor
+from sizes import GSIZE_SQ, GSIZE_CB
+
+"""
+Benchmarks so far:
+- concatenation along split axis
+- reshaping along split axis with new_split
+- resplitting (of a split array)
+- unsplit a split array
+"""
 
 
 @monitor()
 def concatenate(arrays):
-    # benchmark concatenation of 3 arrays with split 1, None, 1 respectively
     a = ht.concatenate(arrays, axis=1)
 
 
 @monitor()
-def reshape(arrays):
-    for array in arrays:
-        a = ht.reshape(array, (10000000, -1), new_split=1)
+def concatenate_nosplit(arrays):
+    a = ht.concatenate(arrays, axis=1)
+
+
+@monitor()
+def reshape(array):
+    a = ht.reshape(array, (array.shape[0] * array.shape[1], -1), new_split=1)
+
+
+@monitor()
+def 
reshape_nosplit(array):
+    a = ht.reshape(array, (array.shape[0] * array.shape[1], -1), new_split=1)
 
 
 @monitor()
-def resplit(array, new_split: List[int | None]):
-    for new_split in new_split:
-        a = ht.resplit(array, axis=new_split)
-        del a
+def resplit(array):
+    a = ht.resplit(array, axis=1)
+
+
+@monitor()
+def unsplit(array):
+    a = ht.resplit(array, axis=None)
 
 
 def run_manipulation_benchmarks():
-    sizes = [10000, 20000, 40000]
-    arrays = []
-    for size in sizes:
-        arrays.append(ht.zeros((1000, size), split=1))
+    arrays = [
+        ht.zeros((GSIZE_SQ // 2, GSIZE_SQ), split=1),
+        ht.zeros((GSIZE_SQ // 2, GSIZE_SQ), split=1),
+    ]
+    concatenate(arrays)
+    del arrays
+
+    arrays = [
+        ht.zeros((GSIZE_SQ // 2, GSIZE_SQ), split=0),
+        ht.zeros((GSIZE_SQ // 2, GSIZE_SQ), split=0),
+    ]
+    concatenate_nosplit(arrays)
+    del arrays
+
+    array = ht.zeros((GSIZE_CB, GSIZE_CB, GSIZE_CB), split=0)
-    reshape(arrays)
+    reshape(array)
+    del array
 
-    arrays = []
-    for i, size in enumerate(sizes):
-        if i == 1:
-            split = None
-        else:
-            split = 1
-        arrays.append(ht.zeros((1000, size), split=split))
-    concatenate(arrays)
+    array = ht.zeros((GSIZE_CB, GSIZE_CB, GSIZE_CB), split=2)
+    reshape_nosplit(array)
+    del array
 
-    if ht.comm.size > 1:
-        shape = [100, 50, 50, 20, 86]
-        n_elements = ht.array(shape).prod().item()
-        mem = n_elements * 4 / 1e9
-        array = ht.reshape(ht.arange(0, n_elements, split=0, dtype=ht.float32), shape) * (
-            ht.comm.rank + 1
-        )
+    array = ht.ones((GSIZE_SQ, GSIZE_SQ), split=0)
+    resplit(array)
 
-        resplit(array, [None, 2, 4])
+    array = ht.ones((GSIZE_SQ, GSIZE_SQ), split=0)
+    unsplit(array)
diff --git a/benchmarks/cb/preprocessing.py b/benchmarks/cb/preprocessing.py
index ebd75ed71c..869f441ae3 100644
--- a/benchmarks/cb/preprocessing.py
+++ b/benchmarks/cb/preprocessing.py
@@ -2,9 +2,17 @@
 import heat as ht
 from mpi4py import MPI
 from perun import monitor
+from sizes import GSIZE_TS_L, GSIZE_TS_S
 
-# we benchmark the in-place versions (`copy=False`) of the preprocessing functions
-# for each function, 
both the forward and the inverse transformation are applied +""" +Benchmarks in this file: +- StandardScaler and inverse_transform +- MinMaxScaler and inverse_transform +- MaxAbsScaler and inverse_transform +- RobustScaler and inverse_transform +- Normalizer (without inverse, of course) +All of them are both fit_transform and inverse_transform (together); data is split along the data axis. +""" @monitor() @@ -42,8 +50,8 @@ def apply_inplace_normalizer(X): def run_preprocessing_benchmarks(): - n_data_points = 5000 - n_features = 50 + n_data_points = GSIZE_TS_L + n_features = GSIZE_TS_S X = ht.random.randn(n_data_points, n_features, split=0) apply_inplace_standard_scaler_and_inverse(X) diff --git a/benchmarks/cb/sizes.py b/benchmarks/cb/sizes.py index 1533c82144..7ae4fed5a7 100644 --- a/benchmarks/cb/sizes.py +++ b/benchmarks/cb/sizes.py @@ -10,7 +10,14 @@ TS_FACTOR_loc = 2 vTS_FACTOR_loc = 4 -# all other variables are calculated based on the number of elements per process +""" +all other variables are calculated based on the number of elements per process +shape of a 2D square array: (GSIZE_SQ, GSIZE_SQ) +shape of a 3D cube array: (GSIZE_CB, GSIZE_CB, GSIZE_CB) +shape of a 2D tall-skinny array: (GSIZE_TS_L, GSIZE_TS_S) +shape of a 2D very tall-skinny array: (GSIZE_vTS_L, GSIZE_vTS_S) +similar for short-fat and very short-fat arrays... +""" n_procs = ht.MPI_WORLD.size N_ELEMENTS_TOTAL = N_ELEMENTS_PER_PROC * n_procs @@ -26,3 +33,10 @@ (N_ELEMENTS_TOTAL / vTS_FACTOR_GLOB) ** 0.5 ) # short dimension of very tall-skinny matrix GSIZE_vTS_L = GSIZE_TS_S * vTS_FACTOR_GLOB + 1 # long dimension of very tall-skinny matrix + +GSIZE_CB = int(N_ELEMENTS_TOTAL ** (1 / 3)) # dimension of a cube array + +""" +Exceptions needed for the moment: +""" +LANCZOS_SIZE = 2**10