cudaLaunchCooperativeKernelMultiDevice is deprecated. What is the alternative?

Hi,
In the CUDA Programming Guide v11.4.1, Section C.8, it says:
"Deprecation Notice: cudaLaunchCooperativeKernelMultiDevice has been deprecated in CUDA 11.3 for all devices. Example of an alternative approach can be found in the multi device conjugate gradient sample. "

Could somebody clarify what is the alternative approach for multi device cooperative launch?

I think I found the conjugateGradientMultiDeviceCG sample on GitHub: cuda-samples/conjugateGradientMultiDeviceCG.cu at master · NVIDIA/cuda-samples · GitHub
However, it uses the same cudaLaunchCooperativeKernelMultiDevice and does not show a different approach.

The code you found on GitHub may be from an older CUDA toolkit release. If you acquire a complete download of the CUDA 11.4 (Update 1) toolkit & samples, does it still use the deprecated function?

EDIT: It appears that on GitHub most files in that code sample subdirectory were updated for the CUDA 11.4 release, but that commit did not touch this .cu file.

I have installed the samples from the current runfile installer, and the workaround is shown there. Thank you.
Basically, one needs to use ordinary single-device grid groups and provide one's own implementation of a multi-grid group.

// Data filled on CPU needed for MultiGPU operations.
struct MultiDeviceData {
  unsigned char *hostMemoryArrivedList;
  unsigned int  numDevices;
  unsigned int  deviceRank;
};

// Class used for coordination of multiple devices.
class PeerGroup {
  const MultiDeviceData &data;
  const cg::grid_group &grid;

  __device__ unsigned char load_arrived(unsigned char *arrived) const {
#if __CUDA_ARCH__ < 700
      return *(volatile unsigned char *)arrived;
#else
      unsigned int result;
      asm volatile ("ld.acquire.sys.global.u8 %0, [%1];" : "=r"(result) : "l"(arrived) : "memory");
      return result;
#endif
  }

  __device__ void store_arrived(unsigned char *arrived, unsigned char val) const {
#if __CUDA_ARCH__ < 700
      *(volatile unsigned char *)arrived = val;
#else
      unsigned int reg_val = val;
      asm volatile ("st.release.sys.global.u8 [%1], %0;" :: "r"(reg_val) "l"(arrived) : "memory");

      // Avoids compiler warnings from unused variable val.
      (void)(reg_val = reg_val);
#endif
  }

  public:
  __device__ PeerGroup(const MultiDeviceData &data, const cg::grid_group &grid) : data(data), grid(grid) {};

  __device__ unsigned int size() const {
      return data.numDevices * grid.size();
  }

  __device__ unsigned int thread_rank() const {
      return data.deviceRank * grid.size() + grid.thread_rank();
  }

  __device__ void sync() const {
    grid.sync();

    // One thread from each grid participates in the sync.
    if (grid.thread_rank() == 0) {
      if (data.deviceRank == 0) {
        // Leader grid waits for others to join and then releases them.
        // Other GPUs can arrive in any order, so the leader has to wait for all of them.
        for (int i = 0; i < data.numDevices - 1; i++) {
          while(load_arrived(&data.hostMemoryArrivedList[i]) == 0);
        }
        for (int i = 0; i < data.numDevices - 1; i++) {
          store_arrived(&data.hostMemoryArrivedList[i], 0);
        }
        __threadfence_system();
      }
      else {
        // Other grids note their arrival and wait to be released.
        store_arrived(&data.hostMemoryArrivedList[data.deviceRank - 1], 1);
        while(load_arrived(&data.hostMemoryArrivedList[data.deviceRank - 1]) == 1);
      }
    }

    grid.sync();
  }
};
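
For context, the deprecated path exposed exactly this coordination through cg::this_multi_grid(); the PeerGroup class above re-implements the subset of that interface the sample needs (size(), thread_rank(), sync()) on top of portable pinned host memory. Roughly, the old kernel obtained the group like this (reconstructed here only for comparison, not verbatim from the pre-11.3 sample):

// Old approach: only valid when launched with cudaLaunchCooperativeKernelMultiDevice.
cg::multi_grid_group multi_grid = cg::this_multi_grid();
...
multi_grid.sync();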

...

__global__ void multiGpuConjugateGradient(
    ..., MultiDeviceData multi_device_data) {

  cg::grid_group grid = cg::this_grid();
  PeerGroup peer_group(multi_device_data, grid);
  ...
   peer_group.sync();
    ...
}
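
As a usage illustration (my own sketch, not from the sample; scaleVector, x, alpha, and N are hypothetical names), work can be split across every thread on every device with an ordinary grid-stride loop over the peer group:

// Hypothetical helper: scale a vector that is visible to all devices (e.g. managed memory).
__device__ void scaleVector(float *x, float alpha, int N, const PeerGroup &peer_group) {
  // Stride by the total number of threads across all participating grids.
  for (int i = peer_group.thread_rank(); i < N; i += peer_group.size()) {
    x[i] *= alpha;
  }
  // Barrier across all grids before any device reads the updated vector.
  peer_group.sync();
}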

...

// In the main function:

// Structure used for cross-grid synchronization.
  MultiDeviceData multi_device_data;
  checkCudaErrors(cudaHostAlloc(&multi_device_data.hostMemoryArrivedList,
                                (kNumGpusRequired - 1) * sizeof(*multi_device_data.hostMemoryArrivedList), cudaHostAllocPortable));
  memset(multi_device_data.hostMemoryArrivedList, 0, (kNumGpusRequired - 1) * sizeof(*multi_device_data.hostMemoryArrivedList));
  multi_device_data.numDevices = kNumGpusRequired;
  multi_device_data.deviceRank = 0;

  void *kernelArgs[] = {
      ..., (void *)&multi_device_data,
  };

  while (deviceId != bestFitDeviceIds.end()) {
    checkCudaErrors(cudaSetDevice(*deviceId));
    checkCudaErrors(cudaLaunchCooperativeKernel((void*)multiGpuConjugateGradient, dimGrid, dimBlock,
                                                kernelArgs, sMemSize, nStreams[device_count++]));
    multi_device_data.deviceRank++;
    deviceId++;
  }
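
After the launch loop, the host still has to wait for the grid on every device to finish and then release the arrival list; roughly like this (same variables as above, details may differ slightly from the shipped sample):

  // Wait for the kernel on every device to complete before using the results.
  deviceId = bestFitDeviceIds.begin();
  device_count = 0;
  while (deviceId != bestFitDeviceIds.end()) {
    checkCudaErrors(cudaSetDevice(*deviceId));
    checkCudaErrors(cudaStreamSynchronize(nStreams[device_count++]));
    deviceId++;
  }

  ...

  // Release the host-side arrival list once all kernels have finished.
  checkCudaErrors(cudaFreeHost(multi_device_data.hostMemoryArrivedList));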

The GitHub version has now been updated as well.
