NCCL code example questions

Hi, I took the NCCL example code from https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/examples.html and added some code to print the send and receive buffers, but every value prints as 0. Can anyone tell me why?

#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "nccl.h"

/*
 * CUDACHECK(cmd): evaluates the CUDA runtime call `cmd` exactly once.
 * If the result is anything other than cudaSuccess, prints the source
 * file, line number and the runtime's error string, then terminates the
 * process with EXIT_FAILURE. The do { ... } while(0) wrapper makes the
 * macro behave as a single statement (safe inside un-braced if/else).
 */
#define CUDACHECK(cmd) do {                                   \
  cudaError_t e = (cmd);                                      \
  if( e != cudaSuccess ) {                                    \
    printf("Failed: Cuda error %s:%d '%s'\n",                 \
        __FILE__,__LINE__,cudaGetErrorString(e));             \
    exit(EXIT_FAILURE);                                       \
  }                                                           \
} while(0)

/*
 * NCCLCHECK(cmd): evaluates the NCCL call `cmd` exactly once.
 * If the result is anything other than ncclSuccess, prints the source
 * file, line number and NCCL's error string, then terminates the process
 * with EXIT_FAILURE. The do { ... } while(0) wrapper makes the macro
 * behave as a single statement (safe inside un-braced loops and if/else).
 */
#define NCCLCHECK(cmd) do {                                   \
  ncclResult_t r = (cmd);                                     \
  if (r!= ncclSuccess) {                                      \
    printf("Failed, NCCL error %s:%d '%s'\n",                 \
        __FILE__,__LINE__,ncclGetErrorString(r));             \
    exit(EXIT_FAILURE);                                       \
  }                                                           \
} while(0)

int main(int argc, char* argv)
{
ncclComm_t comms[4];

//managing 4 devices
int nDev = 4;
int size = 3210241024;
int devs[4] = { 0, 1, 2, 3 };

//allocating and initializing device buffers
float** sendbuff = (float**)malloc(nDev * sizeof(float*));
float** recvbuff = (float**)malloc(nDev * sizeof(float*));
cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);

float* hsend =(float*)malloc(sizenDevsizeof(float));
float* hrecv = (float*)malloc(sizenDevsizeof(float));

for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(i));
CUDACHECK(cudaMalloc(sendbuff + i, size * sizeof(float)));
CUDACHECK(cudaMalloc(recvbuff + i, size * sizeof(float)));
CUDACHECK(cudaMemset(sendbuff[i], 1, size * sizeof(float)));
CUDACHECK(cudaMemset(recvbuff[i], 0, size * sizeof(float)));
CUDACHECK(cudaStreamCreate(s+i));
}

//initializing NCCL
NCCLCHECK(ncclCommInitAll(comms, nDev, devs));

//calling NCCL communication API. Group API is required when using
//multiple devices per thread
NCCLCHECK(ncclGroupStart());
for (int i = 0; i < nDev; ++i)
NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], size, ncclFloat, ncclSum,
comms[i], s[i]));
NCCLCHECK(ncclGroupEnd());

//synchronizing on CUDA streams to wait for completion of NCCL operation
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(i));
CUDACHECK(cudaStreamSynchronize(s[i]));
CUDACHECK(cudaMemcpy(hsend + isize, sendbuff[i], sizeof(float) * size, cudaMemcpyDeviceToHost));
CUDACHECK(cudaMemcpy(hrecv + i
size, recvbuff[i], sizeof(float) * size, cudaMemcpyDeviceToHost));
}

for (int i = 0; i < size*nDev; ++i)
printf(“send:%f, recv:%f\n”, hsend[i], hrecv[i]);

//free device buffers
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(i));
CUDACHECK(cudaFree(sendbuff[i]));
CUDACHECK(cudaFree(recvbuff[i]));
}
//finalizing NCCL
for(int i = 0; i < nDev; ++i)
ncclCommDestroy(comms[i]);
free(hsend);
free(hrecv);
printf(“Success \n”);
return 0;
}