cuda version:
nccl version: 2.21.5
demo code:
#include"cuda_runtime.h"#include<cstdarg>#include"mpi.h"#include"nccl.h"#include<stdint.h>#include<stdio.h>#include<stdlib.h>#include<unistd.h>#include<sys/time.h>#include<time.h>#include<ctime>intglocal_rank=-1;
#defineMPICHECK(cmd) \
do { \
int e = cmd; \
if (e != MPI_SUCCESS) { \
printf("Failed: MPI error %s:%d '%d'\n", __FILE__, __LINE__, e); \
exit(EXIT_FAILURE); \
} \
} while (0)
#defineCUDACHECK(cmd) \
do { \
cudaError_t e = cmd; \
if (e != cudaSuccess) { \
printf("rank %d Failed: Cuda error %s:%d '%s'\n", glocal_rank, __FILE__, __LINE__, \
cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
} while (0)
#defineNCCLCHECK(cmd) \
do { \
ncclResult_t r = cmd; \
if (r != ncclSuccess) { \
printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \
ncclGetErrorString(r)); \
exit(EXIT_FAILURE); \
} \
} while (0)
staticuint64_tgetHostHash(constchar*string) {
// Based on DJB2a, result = result * 33 ^ charuint64_tresult=5381;
for (intc=0; string[c] !='\0'; c++) {
result= ((result << 5) +result) ^ string[c];
}
returnresult;
}
staticvoidgetHostName(char*hostname, intmaxlen) {
gethostname(hostname, maxlen);
for (inti=0; i<maxlen; i++) {
if (hostname[i] =='.') {
hostname[i] ='\0';
return;
}
}
}
voidlog_print(constchar*format, ...) {
va_listargs;
va_start(args, format);
chartimestampBuffer[27] ="";
structtimevaltv;
gettimeofday(&tv, NULL);
std::tmtimeinfo;
localtime_r(&tv.tv_sec, &timeinfo);
snprintf(timestampBuffer, sizeof(timestampBuffer),
"[%04d-%02d-%02dT%02d:%02d:%02d.%03ld] ", timeinfo.tm_year+1900,
timeinfo.tm_mon+1, timeinfo.tm_mday, timeinfo.tm_hour,
timeinfo.tm_min, timeinfo.tm_sec, tv.tv_usec / 1000);
printf("%s", timestampBuffer);
vprintf(format, args);
va_end(args);
}
// 计算两个 timeval 结构之间的时间差,返回微秒数longget_time_diff_microseconds(structtimevalstart, structtimevalend) {
longseconds=end.tv_sec-start.tv_sec;
longmicroseconds=end.tv_usec-start.tv_usec;
returnseconds*1000000+microseconds;
}
intmain(intargc, char*argv[]) {
intsize=60*1024*1024;
intrunCount=1;
structtimevalstart, end;
intmyRank, nRanks, localRank=0;
// initializing MPIMPICHECK(MPI_Init(&argc, &argv));
MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank));
MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks));
// calculating localRank based on hostname which is used in selecting a GPUuint64_thostHashs[nRanks];
charhostname[1024];
getHostName(hostname, 1024);
hostHashs[myRank] =getHostHash(hostname);
MPICHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs,
sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD));
for (intp=0; p<nRanks; p++) {
if (p==myRank)
break;
if (hostHashs[p] ==hostHashs[myRank])
localRank++;
}
cudaStream_ts;
ncclUniqueIdid;
ncclComm_tcomm;
float*sendbuff, *recvbuff;
float*sendbuff2, *recvbuff2;
glocal_rank=myRank;
// get NCCL unique ID at rank 0 and broadcast it to all othersif (myRank==0)
ncclGetUniqueId(&id);
MPICHECK(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD));
printf("set cuda device: %d\n",localRank);
// picking a GPU based on localRank, allocate device buffersCUDACHECK(cudaSetDevice(localRank));
CUDACHECK(cudaMalloc(&sendbuff, size*sizeof(float)));
CUDACHECK(cudaMalloc(&recvbuff, size*sizeof(float)));
CUDACHECK(cudaMalloc(&recvbuff2, size*sizeof(float)));
CUDACHECK(cudaMalloc(&sendbuff2, size*sizeof(float)));
CUDACHECK(cudaStreamCreate(&s));
// initializing NCCLNCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank));
size_trankOffset=size*4;
intallgatherSendCount=size/nRanks;
auto call_func= [&](intcount) {
for (inti=0; i<count; i++) {
NCCLCHECK(ncclGroupStart());
NCCLCHECK(ncclAllReduce((constvoid*)sendbuff, (void*)recvbuff, size,
ncclFloat32, ncclSum, comm, s));
if (allgatherSendCount>0) {
NCCLCHECK(ncclAllGather((constvoid*)(sendbuff2+size*allgatherSendCount), (void*)recvbuff2, allgatherSendCount,
ncclFloat32, comm, s));
}
NCCLCHECK(ncclGroupEnd());
}
// completing NCCL operation by synchronizing on the CUDA streamCUDACHECK(cudaStreamSynchronize(s));
};
// warncall_func(5);
// 获取开始时间, get begin timeif (gettimeofday(&start, NULL) !=0) {
perror("gettimeofday");
return1;
}
runCount=10000;
call_func(runCount);
// 获取结束时间, get end timeif (gettimeofday(&end, NULL) !=0) {
perror("gettimeofday");
return1;
}
// 计算时间差longelapsed_microseconds=get_time_diff_microseconds(start, end);
printf("rank: %d run duration: %lf 秒\n", myRank, (double)elapsed_microseconds/(double)1000000);
// free device buffersCUDACHECK(cudaFree(sendbuff));
CUDACHECK(cudaFree(recvbuff));
// finalizing NCCLncclCommDestroy(comm);
// finalizing MPIMPICHECK(MPI_Finalize());
printf("[MPI Rank %d] runCount: %d Success \n", myRank, runCount);
return0;
}
Run with NCCL version 2.21.5, output duration:
set cuda device: 0
set cuda device: 1
set cuda device: 2
set cuda device: 3
set cuda device: 4
set cuda device: 5
set cuda device: 6
set cuda device: 7
rank: 0 run duration: 40.983264 seconds
rank: 1 run duration: 40.983236 seconds
rank: 2 run duration: 40.983254 seconds
rank: 3 run duration: 40.983243 seconds
rank: 4 run duration: 40.983244 seconds
rank: 5 run duration: 40.983269 seconds
rank: 7 run duration: 40.983285 seconds
rank: 6 run duration: 40.983284 seconds
[MPI Rank 5] runCount: 10000 Success
[MPI Rank 1] runCount: 10000 Success
[MPI Rank 3] runCount: 10000 Success
[MPI Rank 7] runCount: 10000 Success
[MPI Rank 6] runCount: 10000 Success
[MPI Rank 0] runCount: 10000 Success
[MPI Rank 2] runCount: 10000 Success
[MPI Rank 4] runCount: 10000 Success
Run with NCCL version 2.18.3, output duration:
set cuda device: 0
set cuda device: 1
set cuda device: 2
set cuda device: 4
set cuda device: 5
set cuda device: 6
set cuda device: 7
set cuda device: 3
rank: 0 run duration: 17.108217 seconds
rank: 1 run duration: 17.108238 seconds
rank: 7 run duration: 17.108258 seconds
rank: 2 run duration: 17.108236 seconds
rank: 6 run duration: 17.108263 seconds
rank: 3 run duration: 17.108257 seconds
rank: 4 run duration: 17.108248 seconds
rank: 5 run duration: 17.108247 seconds
[MPI Rank 1] runCount: 10000 Success
[MPI Rank 7] runCount: 10000 Success
[MPI Rank 3] runCount: 10000 Success
[MPI Rank 5] runCount: 10000 Success
[MPI Rank 6] runCount: 10000 Success
[MPI Rank 4] runCount: 10000 Success
[MPI Rank 0] runCount: 10000 Success
[MPI Rank 2] runCount: 10000 Success
Why does running on version 2.18.3 yield better performance? I also tested that running allreduce alone in version 2.21.5 takes about 10 seconds, and running allgather alone takes about 6 seconds.
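To clarify what "alone" means here: for the separate measurements, each collective was submitted on its own, without the ncclGroupStart()/ncclGroupEnd() pair around it. Roughly, the timing loop looked like the sketch below (same buffers, comm, and stream as in the demo code above; this is just the shape of it, not the exact code):

// Sketch of the ungrouped variant: the collectives are submitted without
// ncclGroupStart()/ncclGroupEnd(); comment out one of the two calls to time
// AllReduce or AllGather in isolation.
for (int i = 0; i < runCount; i++) {
  NCCLCHECK(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size,
                          ncclFloat32, ncclSum, comm, s));
  NCCLCHECK(ncclAllGather((const void*)sendbuff2, (void*)recvbuff2,
                          allgatherSendCount, ncclFloat32, comm, s));
}
CUDACHECK(cudaStreamSynchronize(s));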
Just to make sure that I understood: you are saying that grouping allreduce and allgather results in a considerable slowdown with NCCL 2.21.5 compared to NCCL 2.18.3, correct? And that if you run each collective separately, the performance is as expected (10+6=16 -- more or less in line with the 17s seen with 2.18.3)?
I don't have a ready answer for you but I would recommend that you try with the most recent NCCL release (2.24.3 was released today!). If you continue seeing this issue with the current release, we'll be happy to investigate. For that, we would probably want to start by comparing the logs from different versions, obtained with NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=INIT,ENV,GRAPH,TUNING (no need to run for 10000 iterations in that case -- you could actually reduce runCount to 0 as the warm-up iterations should tell us all we would need to know).
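If exporting the variables in the launch command is inconvenient, they can also be set from the program itself, as long as that happens before the first NCCL call so they are picked up at initialization. A minimal sketch, assuming the demo code above with the setenv() calls placed right after MPI_Init():

// Sketch: set the suggested debug variables programmatically before the first
// NCCL call (ncclGetUniqueId / ncclCommInitRank). Equivalent to prefixing the
// launch command with: NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=INIT,ENV,GRAPH,TUNING
setenv("NCCL_DEBUG", "INFO", 1);
setenv("NCCL_DEBUG_SUBSYS", "INIT,ENV,GRAPH,TUNING", 1);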