Deadlock on cudaMalloc and cudaMemcpyDtoH in different threads in the same process

Hi, I recently ran into a deadlock when using NCCL for some simple send/recv operations. I have put together a minimal reproduction here: https://pastebin.com/bM9RNgH5

The deadlock happens when one thread is calling cudaMalloc while another thread is doing a device-to-host memcpy, and it reproduces 100% of the time.

The final gdb backtraces of the two threads at the point of the deadlock are attached as a screenshot.

I would really like to understand why these two operations conflict with each other and cause the deadlock.

Operating system: Ubuntu 20.04
CUDA version: 11.4
NCCL version: 2.17.3

Thanks for your help in advance.

Please don’t post pictures of text on this forum.

I’m not able to conveniently access pastebin. It should be possible to create a test case and use the forum tools to post it inline here.

Sorry, my mistake. This is my first time asking for help here. Here is the minimal reproducible code:

/*
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$PWD/nccl_2.17.1-1+cuda11.0_x86_64/lib/:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$PWD/nccl_2.17.1-1+cuda11.0_x86_64/lib/:$LIBRARY_PATH
export C_INCLUDE_PATH=/usr/local/cuda/include/:$PWD/nccl_2.17.1-1+cuda11.0_x86_64/include/:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=/usr/local/cuda/include/:$PWD/nccl_2.17.1-1+cuda11.0_x86_64/include/:$CPLUS_INCLUDE_PATH
g++ send_recv.cc -lpthread -lcudart -lnccl
*/
 
#include <unistd.h>
 
#include <cassert>
#include <chrono>
#include <functional>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
 
#include "cuda_runtime.h"
#include "nccl.h"
 
ncclUniqueId ncclId;
 
#define CUDACHECK(cmd)                                              \
  do {                                                              \
    cudaError_t e = cmd;                                            \
    if (e != cudaSuccess) {                                         \
      printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \
             cudaGetErrorString(e));                                \
      assert(false);                                                \
    }                                                               \
  } while (0)
#define NCCLCHECK(cmd)                                              \
  do {                                                              \
    ncclResult_t r = cmd;                                           \
    if (r != ncclSuccess) {                                         \
      printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \
             ncclGetErrorString(r));                                \
      assert(false);                                                \
    }                                                               \
  } while (0)
 
#define ASYNC 1
 
const int device_count = 4;
 
// Receiver thread: create a communicator for rank dev_id, receive one float
// from rank 0, then copy it back to the host.
void custom_recv(int dev_id, int device_count) {
    CUDACHECK(cudaSetDevice(dev_id));
    ncclComm_t comm;
    // NCCL_VERSION(X,Y,Z) is a function-like macro, so compare NCCL_VERSION_CODE here.
    #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 17, 0)
    ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
    NCCLCHECK(ncclCommInitRankConfig(&comm, device_count, ncclId, dev_id, &config));
    #else
    NCCLCHECK(ncclCommInitRank(&comm, device_count, ncclId, dev_id));
    #endif
    cudaStream_t stream;
    CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    void* buffer;
    CUDACHECK(cudaMalloc(&buffer, sizeof(float)));
 
    NCCLCHECK(ncclGroupStart());
    NCCLCHECK(ncclRecv(buffer, 1, ncclFloat32, 0, comm, stream));
    NCCLCHECK(ncclGroupEnd());
 
    float host_buf;
    #if ASYNC
    CUDACHECK(cudaMemcpyAsync(&host_buf, buffer, sizeof(float), cudaMemcpyDeviceToHost, stream));
    // void* buffer2;
    // CUDACHECK(cudaMallocAsync(&buffer2, sizeof(float)*1024, stream));
    // CUDACHECK(cudaStreamSynchronize(stream));
    #else
    cudaStream_t copy_stream;
    CUDACHECK(cudaStreamCreateWithFlags(&copy_stream, cudaStreamNonBlocking));
    CUDACHECK(cudaMemcpyAsync(&host_buf, buffer, sizeof(float), cudaMemcpyDeviceToHost, copy_stream));
    CUDACHECK(cudaStreamSynchronize(stream));
    #endif
    assert(host_buf == 1);
    std::cout << "device " << dev_id << " recv done" << std::endl;
}
 
int main(int argc, char** argv) {
  {
    NCCLCHECK(ncclGetUniqueId(&ncclId));
 
    // send thread, 0 -> 1 2 3
    std::thread thr0([]() {
      CUDACHECK(cudaSetDevice(0));
      ncclComm_t comm;
      #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 17, 0)
      ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
      NCCLCHECK(ncclCommInitRankConfig(&comm, device_count, ncclId, 0, &config));
      #else
      NCCLCHECK(ncclCommInitRank(&comm, device_count, ncclId, 0));
      #endif
      cudaStream_t stream;
      CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
      void* buffer[device_count];
      for (int i = 1; i < device_count; i++) {
        CUDACHECK(cudaMalloc(&buffer[i], sizeof(float)));
      }
      float r = 1;
      for (int i = 1; i < device_count; i++) {
        CUDACHECK(cudaMemcpy(buffer[i], &r, sizeof(float), cudaMemcpyHostToDevice));
      }
 
      NCCLCHECK(ncclGroupStart());
      for (int i = 1; i < device_count; i++) {
        NCCLCHECK(ncclSend(buffer[i], 1, ncclFloat32, i, comm, stream));
      }
      NCCLCHECK(ncclGroupEnd());
 
      CUDACHECK(cudaStreamSynchronize(stream));
      std::cout << "device 0 send done" << std::endl;
    });
 
    // recv thread: i <- 0
    std::vector<std::thread> threads;
    for (int i = 1; i < device_count; i++) {
      threads.push_back(std::thread(custom_recv, i, device_count));
    }
 
    thr0.join();
    for (auto& thr : threads) {
      thr.join();
    }
  }
}

I found a similar question, and I found that adding cudaDeviceSynchronize() after the cudaMalloc call works around the deadlock.
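
For reference, here is a minimal sketch of how that workaround would look when applied to custom_recv() from the code above. The function name custom_recv_with_workaround is made up for illustration, the version #if is dropped for brevity, and whether cudaDeviceSynchronize() is the right fix (rather than just a workaround) is exactly the open question in this thread:

// Sketch: custom_recv() with the cudaDeviceSynchronize() workaround applied.
// Only the line marked "workaround" differs in substance from the original repro.
void custom_recv_with_workaround(int dev_id, int device_count) {
    CUDACHECK(cudaSetDevice(dev_id));
    ncclComm_t comm;
    NCCLCHECK(ncclCommInitRank(&comm, device_count, ncclId, dev_id));
    cudaStream_t stream;
    CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    void* buffer;
    CUDACHECK(cudaMalloc(&buffer, sizeof(float)));
    // workaround: let the allocation (and any device work it implies) finish
    // before the NCCL recv and the device-to-host copy are issued
    CUDACHECK(cudaDeviceSynchronize());

    NCCLCHECK(ncclGroupStart());
    NCCLCHECK(ncclRecv(buffer, 1, ncclFloat32, 0, comm, stream));
    NCCLCHECK(ncclGroupEnd());

    float host_buf;
    CUDACHECK(cudaMemcpyAsync(&host_buf, buffer, sizeof(float), cudaMemcpyDeviceToHost, stream));
    CUDACHECK(cudaStreamSynchronize(stream));
    assert(host_buf == 1);
    std::cout << "device " << dev_id << " recv done" << std::endl;
}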

Can this issue be fixed?