Skip to content

Commit

Permalink
unified data sizes for all benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
Hoppe committed Oct 7, 2024
1 parent 5fc46fa commit 3fd76cb
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 36 deletions.
5 changes: 2 additions & 3 deletions benchmarks/cb/linalg.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import heat as ht
from mpi4py import MPI
from perun import monitor
from sizes import GSIZE_TS_L, GSIZE_TS_S, GSIZE_SQ
from sizes import GSIZE_TS_L, GSIZE_TS_S, GSIZE_SQ, LANCZOS_SIZE

"""
Benchmarks in this file:
Expand Down Expand Up @@ -99,8 +99,7 @@ def run_linalg_benchmarks():
qr_split_1(a_1)
del a_1

n = 1000
A = ht.random.random((n, n), dtype=ht.float64, split=0)
A = ht.random.random((LANCZOS_SIZE, LANCZOS_SIZE), dtype=ht.float64, split=0)
B = A @ A.T
lanczos(B)
del A, B
Expand Down
79 changes: 51 additions & 28 deletions benchmarks/cb/manipulations.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,49 +2,72 @@
import heat as ht
from typing import List
from perun import monitor
from sizes import GSIZE_SQ, GSIZE_CB

"""
Benchmarks so far:
- concatenation along split axis
- reshaping along split axis with new_split
- resplitting (of a split array)
- unsplit a split array
"""


@monitor()
def concatenate(arrays):
    """Benchmark concatenating `arrays` along axis 1 (the inputs' split axis; see caller)."""
    result = ht.concatenate(arrays, axis=1)


@monitor()
def reshape(arrays):
for array in arrays:
a = ht.reshape(array, (10000000, -1), new_split=1)
def concatenate_nosplit(arrays):
    """Benchmark concatenating `arrays` along axis 1, which is not the inputs' split axis (caller passes split=0)."""
    result = ht.concatenate(arrays, axis=1)


@monitor()
def reshape(array):
    """Benchmark merging the two leading axes of `array` into one, asking for the result to be split along axis 1."""
    merged_dim = array.shape[0] * array.shape[1]
    result = ht.reshape(array, (merged_dim, -1), new_split=1)


@monitor()
def reshape_nosplit(array):
    """Benchmark merging the two leading axes of `array` into one, result split along axis 1.

    NOTE(review): the caller passes a split=2 cube here, so the trailing axis —
    and presumably the local data layout — is preserved; confirm that this is
    what "nosplit" is meant to measure.
    """
    merged_dim = array.shape[0] * array.shape[1]
    result = ht.reshape(array, (merged_dim, -1), new_split=1)


@monitor()
def resplit(array, new_split: List[int | None]):
    """Resplit `array` along each axis listed in `new_split`, freeing each result immediately.

    NOTE(review): the loop variable deliberately(?) shadows the `new_split`
    parameter — functional, but confusing; also `List[int | None]` is evaluated
    at definition time and requires Python >= 3.10.
    """
    for new_split in new_split:
        a = ht.resplit(array, axis=new_split)
        del a
def resplit(array):
    """Benchmark redistributing `array` to be split along axis 1."""
    result = ht.resplit(array, axis=1)


@monitor()
def unsplit(array):
    """Benchmark resplitting `array` to axis=None (removing the split)."""
    result = ht.resplit(array, axis=None)


def run_manipulation_benchmarks():
    """Run all manipulation benchmarks with the globally unified data sizes.

    Every benchmark gets a freshly allocated input that is freed right after
    the call, so the peak memory footprint of each benchmark is comparable.
    """
    # concatenation along the split axis (both inputs split=1, concatenated on axis 1)
    arrays = [
        ht.zeros((GSIZE_SQ // 2, GSIZE_SQ), split=1),
        ht.zeros((GSIZE_SQ // 2, GSIZE_SQ), split=1),
    ]
    concatenate(arrays)
    del arrays

    # concatenation along a non-split axis (inputs split=0, concatenated on axis 1)
    arrays = [
        ht.zeros((GSIZE_SQ // 2, GSIZE_SQ), split=0),
        ht.zeros((GSIZE_SQ // 2, GSIZE_SQ), split=0),
    ]
    concatenate_nosplit(arrays)
    del arrays

    # reshape of a cube split along the first axis
    array = ht.zeros((GSIZE_CB, GSIZE_CB, GSIZE_CB), split=0)
    # BUG FIX: was `reshape(arrays)` — `arrays` had already been deleted above,
    # which would raise NameError; the freshly created `array` is intended.
    reshape(array)
    del array

    # reshape of a cube split along the last axis
    array = ht.zeros((GSIZE_CB, GSIZE_CB, GSIZE_CB), split=2)
    # BUG FIX: was `reshape_nosplit(arrays)` — same stale-name problem as above.
    reshape_nosplit(array)
    del array

    # resplit a split array to another split axis
    array = ht.ones((GSIZE_SQ, GSIZE_SQ), split=0)
    resplit(array)
    del array

    # remove the split entirely (split=0 -> None)
    array = ht.ones((GSIZE_SQ, GSIZE_SQ), split=0)
    unsplit(array)
16 changes: 12 additions & 4 deletions benchmarks/cb/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,17 @@
import heat as ht
from mpi4py import MPI
from perun import monitor
from sizes import GSIZE_TS_L, GSIZE_TS_S

# we benchmark the in-place versions (`copy=False`) of the preprocessing functions
# for each function, both the forward and the inverse transformation are applied
"""
Benchmarks in this file:
- StandardScaler and inverse_transform
- MinMaxScaler and inverse_transform
- MaxAbsScaler and inverse_transform
- RobustScaler and inverse_transform
- Normalizer (without inverse, of course)
For each of them, fit_transform and inverse_transform are applied (together); the data is split along the sample axis.
"""


@monitor()
Expand Down Expand Up @@ -42,8 +50,8 @@ def apply_inplace_normalizer(X):


def run_preprocessing_benchmarks():
n_data_points = 5000
n_features = 50
n_data_points = GSIZE_TS_L
n_features = GSIZE_TS_S
X = ht.random.randn(n_data_points, n_features, split=0)

apply_inplace_standard_scaler_and_inverse(X)
Expand Down
16 changes: 15 additions & 1 deletion benchmarks/cb/sizes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,14 @@
TS_FACTOR_loc = 2
vTS_FACTOR_loc = 4

# all other variables are calculated based on the number of elements per process
"""
all other variables are calculated based on the number of elements per process
shape of a 2D square array: (GSIZE_SQ, GSIZE_SQ)
shape of a 3D cube array: (GSIZE_CB, GSIZE_CB, GSIZE_CB)
shape of a 2D tall-skinny array: (GSIZE_TS_L, GSIZE_TS_S)
shape of a 2D very tall-skinny array: (GSIZE_vTS_L, GSIZE_vTS_S)
similar for short-fat and very short-fat arrays...
"""
n_procs = ht.MPI_WORLD.size
N_ELEMENTS_TOTAL = N_ELEMENTS_PER_PROC * n_procs

Expand All @@ -26,3 +33,10 @@
(N_ELEMENTS_TOTAL / vTS_FACTOR_GLOB) ** 0.5
) # short dimension of very tall-skinny matrix
GSIZE_vTS_L = GSIZE_TS_S * vTS_FACTOR_GLOB + 1 # long dimension of very tall-skinny matrix

GSIZE_CB = int(N_ELEMENTS_TOTAL ** (1 / 3))  # edge length of a 3D cube array

"""
Exceptions needed for the moment:
"""
# fixed problem size for the Lanczos benchmark; deliberately NOT derived
# from N_ELEMENTS_PER_PROC (see "Exceptions" above)
LANCZOS_SIZE = 2**10

0 comments on commit 3fd76cb

Please sign in to comment.