Skip to content

Commit

Permalink
Use hipify to convert cufft -> hipfft
Browse files Browse the repository at this point in the history
  • Loading branch information
ashwinvis committed Sep 22, 2023
1 parent b4b04f1 commit 32b150a
Show file tree
Hide file tree
Showing 6 changed files with 352 additions and 9 deletions.
9 changes: 8 additions & 1 deletion setup_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,9 @@ def create_ext(base_name):
libraries.append("p3dfft")
elif "cufft" in base_name:
libraries.extend(["cufft", "mpi_cxx"])
elif "hipfft" in base_name:
# TODO: should be possible to use cufft as backend instead of rocfft
libraries.extend(["hipfft", "rocfft"])

return Extension(
name="fluidfft.fft" + dim + "." + base_name,
Expand Down Expand Up @@ -162,6 +165,10 @@ def base_names_from_config(config):
base_names.extend(["fft2d_with_cufft"])
base_names.extend(["fft3d_with_cufft"])

if config["hipfft"]["use"]:
base_names.extend(["fft2d_with_hipfft"])
base_names.extend(["fft3d_with_hipfft"])

if config["pfft"]["use"] and not use_mkl_intel:
base_names.extend(["fft3dmpi_with_pfft"])

Expand Down Expand Up @@ -232,7 +239,7 @@ def update_with_config(key):
if configuration["fftw3"]["use"]:
update_with_config("fftw3")

keys = ["pfft", "p3dfft", "cufft"]
keys = ["pfft", "p3dfft", "cufft", "hipfft"]

ext_modules = []

Expand Down
8 changes: 8 additions & 0 deletions setup_make.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,9 @@ def make_command_obj_from_cpp(
"-gencode arch=compute_50,code=sm_50 "
"-gencode arch=compute_50,code=compute_50 -Xcompiler -fPIC"
)
elif "hipfft" in cpp_file:
HIPCC = os.getenv("HIPCC", "hipcc")
command = HIPCC + " -fPIC"

command = [
w
Expand All @@ -272,6 +275,8 @@ def make_command_obj_from_cpp(
if word == "-pthread":
continue
command.append(word)
elif "hipfft" in cpp_file:
pass
else:
mpicxx = _distconfig["MPICXX"].split()
mpicxx.extend(command[1:])
Expand Down Expand Up @@ -324,6 +329,9 @@ def make_command_ext_from_objs(
if "cufft" in ext_file:
NVCC = os.getenv("NVCC", "nvcc")
command = (NVCC + " -Xcompiler -pthread -shared").split()
elif "hipfft" in ext_file:
HIPCC = os.getenv("HIPCC", "hipcc")
command = (HIPCC + " -pthread -shared").split()

command += obj_files + ["-o", ext_file]
if lib_dirs is not None:
Expand Down
29 changes: 23 additions & 6 deletions src_cpp/2d/Makefile
Original file line number Diff line number Diff line change
@@ -1,22 +1,26 @@
FFTW:=no
SINGLE_PREC:=no
OMP:=no
CUDA:=yes
CUDA:=no
HIP:=yes
PFFT:=no
P3DFFT:=no

LD:=mpic++
CC:=mpic++
LD:=CC
CC:=CC
FLAGS:=
LDFLAGS:=
LIBS:= -ldl -lhwloc -lm -lgfortran

LIB_DIR:=$(LDEV_PATH)
INC_DIR:=-I. -I../base $(IDEV_PATH)
INC_DIR:=-I. -I../base $(IDEV_PATH)


SOURCES_CPP := $(wildcard base*.cpp)
SOURCES_CPP += fft2d_with_fftw1d.cpp fft2d_with_fftw2d.cpp fft2dmpi_with_fftwmpi2d.cpp test_bench.cpp
SOURCES_CPP := $(wildcard base*.cpp) test_bench.cpp

ifeq "$(FFTW)" "yes"
SOURCES_CPP += fft2d_with_fftw1d.cpp fft2d_with_fftw2d.cpp fft2dmpi_with_fftwmpi2d.cpp
endif

ifeq "$(CUDA)" "yes"
SOURCES_CU := $(wildcard *.cu)
Expand All @@ -29,6 +33,15 @@ INC_DIR_CU:=-I/usr/lib/openmpi/include -I/usr/lib/openmpi/include/openmpi
INC_DIR+=-I/opt/cuda/NVIDIA_CUDA-6.0_Samples/common/inc/
endif

ifeq "$(HIP)" "yes"
SOURCES_CPP += $(wildcard *hipfft.cpp)
LD:=hipcc
LIBS_HIP:=-L${FFTW_DIR} -L/opt/rocm/lib -lhipfft -lrocfft
LIBS:=${LIBS_HIP}
FLAGS_HIP:=${LIBS_HIP}
INC_DIR_HIP:=-I${FFTW_INC}
endif

ifeq "$(OMP)" "yes"
FLAGS+=-DOMP
ifeq "$(CUDA)" "yes"
Expand Down Expand Up @@ -124,6 +137,10 @@ test_bench.out: $(OBJECTS) $(OBJECTS_CPP_BASE)
# make $@ from $<
$(CC) $(FLAGS) $(INC_DIR) -c $< -o build/$@

%hipfft.o: %hipfft.cpp
# make $@ from $<
hipcc $(FLAGS_HIP) $(INC_DIR) $(INC_DIR_HIP) -c $< -o build/$@

test:
./test_bench.out --N0=64 --N1=32 --N2=16

Expand Down
268 changes: 268 additions & 0 deletions src_cpp/2d/fft2d_with_hipfft.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
#include "hip/hip_runtime.h"


#include <iostream>
using namespace std;

#include <stdlib.h>

#include <sys/time.h>
#include <fft2d_with_hipfft.h>



// KERNEL HIP
// Complex scale
static __device__ __host__ inline dcomplex ComplexScale(dcomplex a, myreal s)
{
dcomplex c;
c.x = s * a.x;
c.y = s * a.y;
return c;
}

__global__ void vectorNorm(const myreal norm, dcomplex *A, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;

if (i < numElements)
{
A[i] = ComplexScale(A[i], norm);
}
}

////////////////// FIN KERNEL HIP

FFT2DWithHIPFFT::FFT2DWithHIPFFT(int argN0, int argN1):
BaseFFT2D::BaseFFT2D(argN0, argN1)
{
struct timeval start_time, end_time;
myreal total_usecs;

this->_init();

/* y corresponds to dim 0 in physical space */
/* x corresponds to dim 1 in physical space */
ny = N0;
nx = N1;

nX0 = N0;
nX0loc = nX0;
nX1 = N1;
nX1loc = nX1;


nKx = nx/2+1;
nKxloc = nKx;
nKy = ny;

/* This 2D fft is NOT transposed */
nK0 = nKy;
nK0loc = nK0;
nK1 = nKx;
nK1loc = nK1;


mem_sizer = sizeof(myreal) * N0 * N1 ;//taille de arrayX
int new_size = nK0 * nK1 ;
mem_size = 2 * sizeof(myreal) * new_size ;//taille de arrayK

gettimeofday(&start_time, NULL);
// Allocate device memory for signal
(hipMalloc((void **)&data, mem_size));
(hipMalloc((void **)&datar, mem_sizer));

// HIPFFT plan
#ifdef SINGLE_PREC
(hipfftPlan2d(&plan, nX0, nX1, HIPFFT_R2C));
(hipfftPlan2d(&plan1, nX0, nX1, HIPFFT_C2R));
#else
(hipfftPlan2d(&plan, nX0, nX1, HIPFFT_D2Z));
(hipfftPlan2d(&plan1, nX0, nX1, HIPFFT_Z2D));
#endif

gettimeofday(&end_time, NULL);

total_usecs = (end_time.tv_sec-start_time.tv_sec) +
(end_time.tv_usec-start_time.tv_usec)/1000000.;

if (rank == 0)
printf("Initialization (%s) done in %f s\n",
this->get_classname(), total_usecs);
}


void FFT2DWithHIPFFT::destroy(void)
{
// cout << "Object is being destroyed" << endl;
hipFree(data);
hipFree(datar);
hipfftDestroy(plan);
hipfftDestroy(plan1);
}


FFT2DWithHIPFFT::~FFT2DWithHIPFFT(void)
{
}


char const* FFT2DWithHIPFFT::get_classname()
{ return "FFT2DWithHIPFFT";}


myreal FFT2DWithHIPFFT::compute_energy_from_X(myreal* fieldX)
{
int ii,jj;
myreal energy = 0.;
myreal energy1;

for (ii=0; ii<nX0; ii++)
{
energy1=0.;
for (jj=0; jj<nX1; jj++)
energy1 += pow(fieldX[ii*nX1+jj], 2);
energy += energy1 / nX1;
}
//cout << "energyX=" << energy / nX0 / 2 << endl;

return energy / nX0 / 2;
}

myreal FFT2DWithHIPFFT::compute_energy_from_K(mycomplex* fieldK)
{
int i0, i1;
double energy = 0;
double energy0 = 0;

// modes i1_seq = iKx = last = nK1 - 1
i1 = nK1 - 1;
for (i0=0; i0<nK0; i0++)
// we must divide by 2 ==> after
energy += (double) pow(abs(fieldK[i1 + i0*nK1]), 2);
if (N1%2 == 0)
// divide by 2!!!
energy *= 0.5;

// other modes
for (i0=0; i0<nK0; i0++)
for (i1=1; i1<nK1-1; i1++)
energy += (double) pow(abs(fieldK[i1 + i0*nK1]), 2);

// modes i1_seq = iKx = 0
i1 = 0;
for (i0=0; i0<nK0; i0++)
//we must divide by 2 ==> after
energy0 += (double) pow(abs(fieldK[i0*nK1]), 2);

energy += energy0*0.5;

//cout << "energyK=" << energy<< endl;
return (myreal) energy;
}


myreal FFT2DWithHIPFFT::sum_wavenumbers(myreal* fieldK)
{
int i0, i1;
double sum_tot = 0;
double sum0 = 0;

// modes i1_seq = iKx = last = nK1 - 1
i1 = nK1 - 1;
for (i0=0; i0<nK0; i0++)
//we must divide by 2 ==> after
sum_tot += (double) fieldK[i1 + i0*nK1];
if (N1%2 == 0)
sum_tot *= 0.5; //divide by 2!!!

// other modes
for (i0=0; i0<nK0; i0++)
for (i1=1; i1<nK1-1; i1++)
sum_tot += (double) fieldK[i1 + i0*nK1];

// modes i1_seq = iKx = 0
i1 = 0;
for (i0=0; i0<nK0; i0++)
//we must divide by 2 ==> after
sum0 += (double) fieldK[i0*nK1];

sum_tot += 0.5 * sum0;

//cout << "sumK=" << sum<< endl;
return (myreal) 2*sum_tot;
}



myreal FFT2DWithHIPFFT::compute_mean_from_X(myreal* fieldX)
{
myreal mean,mean1;
int ii,jj;
mean=0.;

for (ii=0; ii<nX0; ii++)
{
mean1=0.;
for (jj=0; jj<nX1; jj++)
{
mean1 += fieldX[ii*nX1+jj];
}
mean += mean1 / nX1;
}
return mean / nX0;
}


myreal FFT2DWithHIPFFT::compute_mean_from_K(mycomplex* fieldK)
{
myreal mean;
mean = real(fieldK[0]);

return mean;
}


void FFT2DWithHIPFFT::fft(myreal *fieldX, mycomplex *fieldK)
{
// cout << "FFT2DWithHIPFFT::fft" << endl;
// Copy host memory to device
(hipMemcpy(datar, fieldX, mem_sizer, hipMemcpyHostToDevice));

// Transform signal and kernel
//printf("Transforming signal hipfftExecD2Z\n");
#ifdef SINGLE_PREC
(hipfftExecR2C(plan, (hipfftReal *)datar, (hipfftComplex *)data));
#else
(hipfftExecD2Z(plan, (hipfftDoubleReal *)datar, (hipfftDoubleComplex *)data));
#endif

// Launch the Vector Norm HIP Kernel
myreal norm = inv_coef_norm;
// printf("HIP kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
int threadsPerBlock = 256;
int blocksPerGrid =(nK0 * nK1 + threadsPerBlock - 1) / threadsPerBlock;
hipLaunchKernelGGL(vectorNorm, blocksPerGrid, threadsPerBlock, 0, 0, norm, data, nK0 * nK1 );

// Copy host device to memory
(hipMemcpy(fieldK, data, mem_size, hipMemcpyDeviceToHost));
}


void FFT2DWithHIPFFT::ifft(mycomplex *fieldK, myreal *fieldX)
{
//cout << "FFT2DWithHIPFFT::ifft" << endl;
// Copy host memory to device
(hipMemcpy(data, fieldK, mem_size, hipMemcpyHostToDevice));

// FFT on DEVICE
#ifdef SINGLE_PREC
(hipfftExecC2R(plan1, (hipfftComplex *)data, (hipfftReal *)datar));
#else
(hipfftExecZ2D(plan1, (hipfftDoubleComplex *)data, (hipfftDoubleReal *)datar));
#endif

// Copy host device to memory
(hipMemcpy(fieldX, datar, mem_sizer, hipMemcpyDeviceToHost));
}

Loading

0 comments on commit 32b150a

Please sign in to comment.