NCCL code example questions

siri · April 9, 2020, 8:41am

Hi, I take nccl code from: Examples — NCCL 2.14.3 documentation, add some print code, but print 0, can anyone tell me why?

#include <stdio.h>
#include “cuda_runtime.h”
#include “nccl.h”

#define CUDACHECK(cmd) do {
cudaError_t e = cmd;
if( e != cudaSuccess ) {
printf(“Failed: Cuda error %s:%d ‘%s’\n”,
FILE,LINE,cudaGetErrorString(e));
exit(EXIT_FAILURE);
}
} while(0)

#define NCCLCHECK(cmd) do {
ncclResult_t r = cmd;
if (r!= ncclSuccess) {
printf(“Failed, NCCL error %s:%d ‘%s’\n”,
FILE,LINE,ncclGetErrorString(r));
exit(EXIT_FAILURE);
}
} while(0)

int main(int argc, char* argv)
{
ncclComm_t comms[4];

//managing 4 devices
int nDev = 4;
int size = 3210241024;
int devs[4] = { 0, 1, 2, 3 };

//allocating and initializing device buffers
float** sendbuff = (float**)malloc(nDev * sizeof(float*));
float** recvbuff = (float**)malloc(nDev * sizeof(float*));
cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);

float* hsend =(float*)malloc(sizenDevsizeof(float));
float* hrecv = (float*)malloc(sizenDevsizeof(float));

for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(i));
CUDACHECK(cudaMalloc(sendbuff + i, size * sizeof(float)));
CUDACHECK(cudaMalloc(recvbuff + i, size * sizeof(float)));
CUDACHECK(cudaMemset(sendbuff[i], 1, size * sizeof(float)));
CUDACHECK(cudaMemset(recvbuff[i], 0, size * sizeof(float)));
CUDACHECK(cudaStreamCreate(s+i));
}

//initializing NCCL
NCCLCHECK(ncclCommInitAll(comms, nDev, devs));

//calling NCCL communication API. Group API is required when using
//multiple devices per thread
NCCLCHECK(ncclGroupStart());
for (int i = 0; i < nDev; ++i)
NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], size, ncclFloat, ncclSum,
comms[i], s[i]));
NCCLCHECK(ncclGroupEnd());

//synchronizing on CUDA streams to wait for completion of NCCL operation
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(i));
CUDACHECK(cudaStreamSynchronize(s[i]));
CUDACHECK(cudaMemcpy(hsend + isize, sendbuff[i], sizeof(float) * size, cudaMemcpyDeviceToHost));
CUDACHECK(cudaMemcpy(hrecv + isize, recvbuff[i], sizeof(float) * size, cudaMemcpyDeviceToHost));
}

for (int i = 0; i < size*nDev; ++i)
printf(“send:%f, recv:%f\n”, hsend[i], hrecv[i]);

//free device buffers
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(i));
CUDACHECK(cudaFree(sendbuff[i]));
CUDACHECK(cudaFree(recvbuff[i]));
}
//finalizing NCCL
for(int i = 0; i < nDev; ++i)
ncclCommDestroy(comms[i]);
free(hsend);
free(hrecv);
printf(“Success \n”);
return 0;
}

dariusz.sciebura · January 11, 2024, 7:35am

Hi, have you configured nvcc with proper arch? If you set it up to native, the problem you observe should vanish.

Topic		Replies	Views
NVProf for NCCL program GPU-Accelerated Libraries nccl	2	972	May 28, 2021
ncclGroupEnd "unhandled cuda error" CUDA Programming and Performance	8	3305	October 23, 2020
How to use emulation mode CUDA Programming and Performance	0	778	July 31, 2009
unresolved external symbol _main referenced in function ___tmainCRTStartup CUDA Programming and Performance	7	9312	February 22, 2011
Proccess block when call Nccl reduce CUDA Programming and Performance	1	761	May 19, 2018
Why am I unable to compile a CUDA program even though I have nvcc? CUDA Setup and Installation	3	622	December 4, 2023
validation of CUDA installation Jetson AGX Xavier	6	920	October 18, 2021
nvcc and googletest CUDA Programming and Performance	5	16370	July 7, 2011
NVCC Compile Shared Library CUDA Programming and Performance	10	1851	October 12, 2021
CUDA 2.3a/nvcc frustrations CUDA Programming and Performance	13	30980	September 22, 2009

NCCL code example questions

Related topics