ncclGroupEnd "unhandled cuda error"

Trying to use NCCL 2.05 for multi-GPU reduction of complex values on AWS p3 Volta instances with Ubuntu 16.04 & SDK 9.1

My code:

ncclComm_t comms[MAX_GPU];
NCCLCHECK(ncclCommInitAll(comms, nDevices, nullptr));

NCCLCHECK(ncclGroupStart());
for (int j = 1; j < nlim; j++) 
{
	NCCLCHECK(ncclReduce((const void*)params[j]->dev_wrxsa[0], (void*)params[0]->dev_wrxsa[0], 
			npulse_size * 2, ncclFloat, ncclSum, 0, comms[j], streams[j]));
}
NCCLCHECK(ncclGroupEnd());

I am getting “unhandled cuda error” on the ncclGroupEnd function call.
If I delete that line, the code will sometimes complete w/o error, but mostly core dumps.

The send and receive buffers are allocated with cudaMallocManaged.

I’m expecting this to sum all other GPU’s buffers into the GPU 0 buffer.

How can I figure out what’s causing the “unhandled cuda error”?

cuda-memcheck ?

Thanks for the suggestion txbob.

Ran memcheck and all I get (besides a bunch of peer access already enabled errors) is:

========= Program hit cudaErrorNotSupported (error 71) due to "operation not supported" on CUDA API call to cudaEventRecord.
Failed, NCCL error RadarSim/Radar.cu:733 'unhandled cuda error'
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/jet/lib/libcuda.so.1 [0x311153]
=========     Host Frame:./clutterProc.exe [0x8ba5a]
=========     Host Frame:./clutterProc.exe [0x1ab5d]
=========     Host Frame:./clutterProc.exe [0x1611a]
=========     Host Frame:./clutterProc.exe [0xa724]
=========     Host Frame:./clutterProc.exe [0x80e7]
=========     Host Frame:/jet/lib/libc.so.6 (__libc_start_main + 0xf0) [0x20660]
=========     Host Frame:./clutterProc.exe [0x6df9]

Cannot make anything out of ^this.

I even disable all my peer access requests, but I still get superfluous errors about it.

So now what you’re really asking is “please help me debug my code”.

For that, I would not even think about getting started without a complete test case.

In this particular case, I’m reasonably sure there is a NCCL allreduce sample code, so I would certainly suggest you study that.

No, my code is working perfectly, until I add NCCL calls to make NVLink speeds work in Linux.

The code I added which is detailed here, is almost identical to the allreduce example, except I am using ncclReduce and complex data types as float x 2. Should work since I’m just summing real and imag independently.

In fact the previous version works very fast with NVLink speeds in Windows on a 4 GPU instance with a custom kernel for summing complex float buffers.

I just cannot identify anything which could cause ncclGroupEnd to create an “unhandled cuda error” and I don’t even know what that is or what COULD cause it.

I compiled and ran the allReduce example code on an AWS 8 GPU p3 instance running Ubuntu 16.
Linked with libnccl_static.a from NCCL v2.0.4
It runs with no errors.

I then changed it to call ncclReduce with root node 0 and it also runs with no errors.

allreduce.cpp listed below

#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "nccl.h"
#define EXIT_FAILURE 1
#define MAX_GPU 8

#define CUDACHECK(cmd) do {                         \
  cudaError_t e = cmd;                              \
  if( e != cudaSuccess ) {                          \
    printf("Failed: Cuda error %s:%d '%s'\n",             \
        __FILE__,__LINE__,cudaGetErrorString(e));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

#define NCCLCHECK(cmd) do {                         \
  ncclResult_t r = cmd;                             \
  if (r!= ncclSuccess) {                            \
    printf("Failed, NCCL error %s:%d '%s'\n",             \
        __FILE__,__LINE__,ncclGetErrorString(r));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

int main(int argc, char* argv[])
{
  ncclComm_t comms[MAX_GPU];
  //managing  devices
  int nDev = 4;
  CUDACHECK(cudaGetDeviceCount(&nDev));
  printf("%d gpus found\n", nDev);
  int size = 32*1024*1024;
  int devs[4] = { 0, 1, 2, 3 };

  //allocating and initializing device buffers
  float** sendbuff = (float**)malloc(nDev * sizeof(float*));
  float** recvbuff = (float**)malloc(nDev * sizeof(float*));
  cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);

  for (int i = 0; i < nDev; ++i) {
    printf("init gpu %d\n", i);
    CUDACHECK(cudaSetDevice(i));
    CUDACHECK(cudaMalloc(sendbuff + i, size * sizeof(float)));
    CUDACHECK(cudaMalloc(recvbuff + i, size * sizeof(float)));
    CUDACHECK(cudaMemset(sendbuff[i], 1, size * sizeof(float)));
    CUDACHECK(cudaMemset(recvbuff[i], 0, size * sizeof(float)));
    CUDACHECK(cudaStreamCreate(s+i));
  }

  //initializing NCCL
  NCCLCHECK(ncclCommInitAll(comms, nDev, 0));

  //calling NCCL communication API. Group API is required when using
  //multiple devices per thread
  NCCLCHECK(ncclGroupStart());
  for (int i = 0; i < nDev; ++i)
    NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], size, ncclFloat, ncclSum,
        comms[i], s[i]));
  NCCLCHECK(ncclGroupEnd());

  //synchronizing on CUDA streams to wait for completion of NCCL operation
  for (int i = 0; i < nDev; ++i) {
    CUDACHECK(cudaSetDevice(i));
    CUDACHECK(cudaStreamSynchronize(s[i]));
  }

  //free device buffers
  for (int i = 0; i < nDev; ++i) {
    CUDACHECK(cudaSetDevice(i));
    CUDACHECK(cudaFree(sendbuff[i]));
    CUDACHECK(cudaFree(recvbuff[i]));
  }

  //finalizing NCCL
  for(int i = 0; i < nDev; ++i)
      ncclCommDestroy(comms[i]);

  printf("Success \n");
  return 0;
}