cudaMallocAsync()/cudaFreeAsync() in a multi-threaded environment

Hello

I have a question about using cudaMallocAsync()/cudaFreeAsync() in a multi-threaded environment. I have created two almost identical examples streamsync.cc and devicesync.cc. The only difference is that streamsync.cc is calling cudaStreamSynchronize() whereas devicesync.cc is calling cudaDeviceSynchronize() in the class Context’s destructor.

The example applications take as argument the number of threads to create in “one chunk”. The main thread creates new threads, then waits a bit before creating new threads and so on. A such created thread calls the function foo(), which creates a Context, allocates memory, deallocates memory and destroys the Context.

The example streamsync.cc is working fine. The example devicesync.cc is experiencing a SIGSEGV “after a while” in cudaFreeAsync(). The more threads are being created in “one chunk” the higher the chances for a SIGSEGV ( e.g. using “./devicesynctest 128”).

Is there anything wrong or missing in my example devicesync.cc? ( Calling cudaStreamSynchronize() before and after cudaDeviceSynchronize() in example devicesync.cc leads to a SIGSEGV as well). The example code has been used with a Tesla T4 ( NVIDIA Corporation TU104GL [Tesla T4] (rev a1)) running the latest Cuda version available for download for Ubuntu 22.04).

Thank you for your help.

nvidia-smi

The examples are compiled with the latest 11.8 for Ubuntu 22.04 (Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0).

2347 0 lrwxrwxrwx 1 root root 20 Sep 29 09:22 /lib/x86_64-linux-gnu/libcuda.so.1 -> libcuda.so.520.61.05
Call stack for "gdb --args ./devicesynctest 1" which crashed after some minutes:
Thread 8577 "devicesynctest" received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7fffed622000 (LWP 2625784)]
0x00007ffff6333ea5 in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1
(gdb) where
#0  0x00007ffff6333ea5 in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1
#1  0x00007ffff63349d8 in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1
#2  0x00007ffff64404c6 in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1
#3  0x00007ffff6394a14 in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1
#4  0x000055555556701a in __cudart906 ()
#5  0x00005555555ae763 in cudaFreeAsync_ptsz ()
#6  0x000055555555f286 in (anonymous namespace)::foo () at /home/finke/develop/cuda/devicesync/devicesync.cc:63
#7  0x0000555555560585 in std::__invoke_impl<void, void (*)()> (__f=@0x555555634a98: 0x55555555f0b3 <(anonymous namespace)::foo()>) at /usr/include/c++/11/bits/invoke.h:61
#8  0x0000555555560543 in std::__invoke<void (*)()> (__fn=@0x555555634a98: 0x55555555f0b3 <(anonymous namespace)::foo()>) at /usr/include/c++/11/bits/invoke.h:96
#9  0x00005555555604e4 in std::thread::_Invoker<std::tuple<void (*)()> >::_M_invoke<0ul> (this=0x555555634a98) at /usr/include/c++/11/bits/std_thread.h:253
#10 0x00005555555604b4 in std::thread::_Invoker<std::tuple<void (*)()> >::operator() (this=0x555555634a98) at /usr/include/c++/11/bits/std_thread.h:260
#11 0x0000555555560494 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (*)()> > >::_M_run (this=0x555555634a90) at /usr/include/c++/11/bits/std_thread.h:211
#12 0x00007ffff7e5e2b3 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#13 0x00007ffff7bceb43 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#14 0x00007ffff7c60a00 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81
(gdb) info thread
  Id   Target Id                                            Frame 
  1    Thread 0x7ffff7a4e000 (LWP 2615499) "devicesynctest" 0x00007ffff7c1f868 in __GI___clock_nanosleep (clock_id=clock_id@entry=0, flags=flags@entry=0, req=0x7fffffffe920, rem=0x7fffffffe920)
    at ../sysdeps/unix/sysv/linux/clock_nanosleep.c:78
  2    Thread 0x7fffeffeb000 (LWP 2615504) "cuda-EvtHandlr" 0x00007ffff7c52d7f in __GI___poll (fds=0x55555562a950, nfds=2, timeout=-1) at ../sysdeps/unix/sysv/linux/poll.c:29
  4    Thread 0x7fffeefe9000 (LWP 2615506) "cuda-EvtHandlr" 0x00007ffff7c52d7f in __GI___poll (fds=0x7fffe4000c20, nfds=11, timeout=100) at ../sysdeps/unix/sysv/linux/poll.c:29
* 8577 Thread 0x7fffed622000 (LWP 2625784) "devicesynctest" 0x00007ffff6333ea5 in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1

devicesync.cc:

#include <cassert>
#include <chrono>
#include <iostream>
#include <mutex>
#include <sstream>
#include <thread>

namespace {

#define THROWMSG( msg ) { \
  std::ostringstream os; \
  os << msg; \
  assert( false); \
  throw std::runtime_error( os.str());\
} \

class Context
{
public:
  Context( int device)
  : currentDevice( 0) {
    cudaError_t err = cudaSetDevice( device);
    if( err != cudaSuccess) {
      THROWMSG("Context::setDevice[" << device << "] error");
    }

    currentStream = cudaStreamPerThread;
    currentDevice = (unsigned int) device;
  }

  ~Context() {
    //throw() in d'tor for the purpose of this example
    
    cudaError_t err = cudaDeviceSynchronize();
    if( err != cudaSuccess) {
      THROWMSG("cudaDeviceSynchronize: could not sync device");
    }
  }

public:
  cudaStream_t   currentStream;
  unsigned int   currentDevice;
};

void foo() {

  std::cout << "start: " << std::this_thread::get_id() << std::endl;

  {
    Context context( 0);
    
    void* ptr = 0;
    const int size = 42;
    cudaError_t err = cudaMallocAsync( &ptr, size, context.currentStream);
    
    if( err != cudaSuccess) {
      THROWMSG( "cudaMallocAsync error allocating "
        << size << " bytes: ["
        << cudaGetErrorName( err) << "]-[" << cudaGetErrorString( err) << "]");
    }
    assert( ptr);
    
    err = cudaFreeAsync( ptr, context.currentStream);

    if( err != cudaSuccess) {    
      THROWMSG( "cudaFreeAsync error: ["
          << cudaGetErrorName( err) << "]-[" << cudaGetErrorString( err) << "]");
    }
  }
  
  std::cout << "end: " << std::this_thread::get_id() << std::endl;
}

}//namespace

int main(int argc, char** argv) {

  if( argc < 2) {
    return 1;
  }
  
  cudaDeviceReset();

  const int items = std::atoi( argv[1]);
  
  for(; ;) {

    for( int j=0; j!=items; ++j) {
      std::thread t( &foo);
      t.detach();
    }
    
    //throttle thread creation a bit
    std::this_thread::sleep_for( std::chrono::milliseconds(42));
  }
  
  //never reached in this case
  cudaDeviceReset();
  
  return 0;
}

compile & link example devicesync.cc 
/usr/local/cuda-11.8/bin/nvcc -v -DDEBUG -O0 -g -Xcompiler -fPIC -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87 -gencode=arch=compute_87,code=compute_87 --default-stream per-thread -ccbin=gcc-11 -x cu --device-c -o devicesync.o -c devicesync.cc &&
/usr/local/cuda-11.8/bin/nvcc -v -DDEBUG -O0 -g -Xcompiler -fPIC -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87 -gencode=arch=compute_87,code=compute_87 --default-stream per-thread -ccbin=gcc-11 --device-link devicesync.o --output-file cuda_devicesync.o &&
g++ devicesync.o cuda_devicesync.o /usr/local/cuda-11.8/lib64/libcudart_static.a -o devicesynctest

streamsync.cc

#include <cassert>
#include <chrono>
#include <iostream>
#include <mutex>
#include <sstream>
#include <thread>

namespace {

#define THROWMSG( msg ) { \
  std::ostringstream os; \
  os << msg; \
  assert( false); \
  throw std::runtime_error( os.str());\
} \

class Context
{
public:
  Context( int device)
  : currentDevice( 0) {
    cudaError_t err = cudaSetDevice( device);
    if( err != cudaSuccess) {
      THROWMSG("Context::setDevice[" << device << "] error");
    }

    currentStream = cudaStreamPerThread;
    currentDevice = (unsigned int) device;
  }

  ~Context() {
    //throw() in d'tor for the purpose of this example
    
    cudaError_t err = cudaStreamSynchronize( currentStream);
    if( err != cudaSuccess) {
      THROWMSG("syncStream: could not sync stream");
    }
  }

public:
  cudaStream_t   currentStream;
  unsigned int   currentDevice;
};

void foo() {

  std::cout << "start: " << std::this_thread::get_id() << std::endl;

  {
    Context context( 0);
    
    void* ptr = 0;
    const int size = 42;
    cudaError_t err = cudaMallocAsync( &ptr, size, context.currentStream);
    
    if( err != cudaSuccess) {
      THROWMSG( "cudaMallocAsync error allocating "
        << size << " bytes: ["
        << cudaGetErrorName( err) << "]-[" << cudaGetErrorString( err) << "]");
    }
    assert( ptr);
    
    err = cudaFreeAsync( ptr, context.currentStream);

    if( err != cudaSuccess) {    
      THROWMSG( "cudaFreeAsync error: ["
          << cudaGetErrorName( err) << "]-[" << cudaGetErrorString( err) << "]");
    }
  }
  
  std::cout << "end: " << std::this_thread::get_id() << std::endl;
}

}//namespace

int main(int argc, char** argv) {

  if( argc < 2) {
    return 1;
  }
  
  cudaDeviceReset();

  const int items = std::atoi( argv[1]);
  
  for(; ;) {

    for( int j=0; j!=items; ++j) {
      std::thread t( &foo);
      t.detach();
    }
    
    //throttle thread creation a bit
    std::this_thread::sleep_for( std::chrono::milliseconds(42));
  }
  
  //never reached in this case
  cudaDeviceReset();
  
  return 0;
}

compile & link example streamsync.cc:
/usr/local/cuda-11.8/bin/nvcc -v -DDEBUG -O0 -g -Xcompiler -fPIC -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87 -gencode=arch=compute_87,code=compute_87 --default-stream per-thread -ccbin=gcc-11 -x cu --device-c -o streamsync.o -c streamsync.cc &&
/usr/local/cuda-11.8/bin/nvcc -v -DDEBUG -O0 -g -Xcompiler -fPIC -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87 -gencode=arch=compute_87,code=compute_87 --default-stream per-thread -ccbin=gcc-11 --device-link streamsync.o --output-file cuda_streamsync.o &&
g++ streamsync.o cuda_streamsync.o /usr/local/cuda-11.8/lib64/libcudart_static.a -o streamsynctest

I can see the same error with driver 520.61.05 .

//nvcc -O0 -g -lineinfo main.cu -o main
#include <cassert>
#include <iostream>
#include <vector>
#include <future>


int main(){
  auto threadfunc = [](){
      cudaSetDevice(0);
      cudaStream_t stream = cudaStreamPerThread;
      void* ptr; 
      cudaError_t status = cudaMallocAsync(&ptr, 1, stream);
      assert(status == cudaSuccess);
      status = cudaFreeAsync(ptr, stream);
      assert(status == cudaSuccess);
      status = cudaDeviceSynchronize();
      //status = cudaStreamSynchronize(stream);
      assert(status == cudaSuccess);
  };

  cudaSetDevice(0);

  int iteration = 0;
  
  while(true){
    std::cerr << "iteration " << iteration << "\n";

    std::vector<std::future<void>> vec;
    for(int i = 0; i < 2; i++){
        vec.emplace_back(std::async(std::launch::async, threadfunc));
    }
    for(auto& f: vec){
      f.wait();
    }

    iteration++;    
  }
  
  return 0;
}

I suggest filing a bug.

Thank you for checking and providing a simplified example.

I have filed, as suggested, a bug: https://developer.nvidia.com/nvidia_bug/3860286

Thanks for the bug, the team has identified the issue and checked in a fix. It will arrive in a future CUDA release. At this time I’m not able to say which.