How to use cudaImportExternalSemaphore in Linux?

Is there a way to create a Semaphore in CPU memory in Linux and use it in CUDA? I want to block all operations on a stream until the Semaphore is signaled by CPU code.

Documentation seems to suggest cudaImportExternalSemaphore can be used to import an external Semaphore and cudaWaitExternalSemaphoresAsync can be used to make a stream wait until the Semaphore is signaled. But there is no example that shows how to import a Semaphore created in Linux.

What semaphore can be imported? Posix sem_t? Is there an example someone can point to? I see some examples that use Vulkan. Can only semaphores created in Vulkan be imported?

The document also says “It is illegal to issue a wait before the corresponding signal has been issued.” Does this mean, it is illegal to wait on the semaphore before the Semaphore is signaled? I’m trying to understand what is the benefit of using a Semaphore if wait cannot be called before the semaphore is signaled.

Thanks,
Indu

The programming guide shows how to use this API. Programming Guide :: CUDA Toolkit Documentation
It seems it cannot be used for arbitrary semaphores, but only ones exported by vulkan, opengl, direct3d, and nvsci.

I tried to import an eventfd-based semaphore, but this just gives an unknown error


#include <future>
#include <iostream>
#include <cassert>
#include <chrono>

#include <sys/eventfd.h>
#include <unistd.h>

// The eventfd counter is a 64-bit value; this code reads and writes it
// through size_t variables, so require size_t to be 8 bytes wide.
static_assert(sizeof(size_t) == 8);

// Wraps a CUDA runtime call and reports the call site on failure.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prints a readable CUDA error message with file/line and, unless
// abort==false, terminates the process using the error code as exit status.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// Trivial kernel used to observe when work queued on the stream is actually
// allowed to run; its device-side printf appears once the kernel executes.
__global__
void printkernel(){
    printf("printkernel started\n");
}

// Sleeps briefly, then signals the eventfd-based semaphore once by adding 1
// to its 64-bit counter.
void workerthread1(int semaphorefd){
    std::cerr << "workerthread1 started\n";
    std::this_thread::sleep_for(std::chrono::seconds{2});

    const size_t increment = 1;
    const long numbyteswritten = write(semaphorefd, &increment, sizeof(increment));
    assert(numbyteswritten == sizeof(increment));
}

int main(){

    // Create an eventfd in semaphore mode: each read decrements the 64-bit
    // counter by 1 and blocks while the counter is zero.
    int semfd = eventfd(0, EFD_SEMAPHORE);
    if(semfd == -1){
        perror("eventfd");
        assert(false);
    }

    // Signal the eventfd from another thread after a short delay.
    auto future = std::async(std::launch::async, workerthread1, semfd);

    std::cerr << "waiting for semaphore\n";
    // This read blocks until workerthread1 writes to the eventfd.
    size_t semaphorevalue; 
    int numbytesreads = read(semfd, &semaphorevalue, 8);
    assert(numbytesreads == 8);

    std::cerr << "waited for semaphore\n";
    future.wait();

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Attempt to hand the raw eventfd to CUDA as an "opaque FD" external
    // semaphore. NOTE(review): the opaque-FD handle type appears to be meant
    // for handles exported by APIs such as Vulkan, not arbitrary Linux fds,
    // which would explain why this import fails with an unknown error.
    cudaExternalSemaphore_t externalSem;
    cudaExternalSemaphoreHandleDesc externSemDesc;
    memset(&externSemDesc, 0, sizeof(cudaExternalSemaphoreHandleDesc));
    externSemDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd;
    externSemDesc.handle.fd = semfd;

    gpuErrchk(cudaImportExternalSemaphore (&externalSem, &externSemDesc));  //does not work

    // Intended flow (had the import succeeded): make `stream` wait on the
    // semaphore, queue a kernel behind that wait, and signal from a second
    // worker thread so the kernel is released.
    cudaExternalSemaphoreWaitParams waitParams;
    memset(&waitParams, 0, sizeof(cudaExternalSemaphoreWaitParams));
    gpuErrchk(cudaWaitExternalSemaphoresAsync(&externalSem, &waitParams, 1, stream)); 
    printkernel<<<1,1,0,stream>>>();

    auto future2 = std::async(std::launch::async, workerthread1, semfd);

    gpuErrchk(cudaDeviceSynchronize());

    future2.wait();
}

Do you really need a semaphore? You can use cudaLaunchHostFunc to get stream-ordered CPU work, and CUDA events can then be used to synchronize two streams.

I tried cudaLaunchHostFunc but it appears two host functions cannot execute at the same time. So, if I run host_fn_wait_for_semaphore in stream_one and then run host_fn_signal_semaphore in stream_two, the program hangs because CUDA runtime executes host_fn_signal_semaphore only after host_fn_wait_for_semaphore returns (which never happens because it is waiting on a semaphore).

Here is a simple example showing this problem. host_fn_one on stream_one sleeps for 10 sec and then sets an atomic boolean to true. host_fn_two on stream_two simply reads the boolean. You can see that host_fn_two runs only after 10 seconds after host_fn_one is complete.

Is there a way to make multiple host functions execute concurrently?

#include <cuda_runtime.h>

#include <atomic>
#include <cassert>
#include <chrono>
#include <cstring>
#include <ctime>
#include <iostream>
#include <thread>

// Checks a CUDA runtime call: on any failure, prints the file/line and the
// CUDA error string, then terminates the process.
#define CUDACHECK(cmd) do {                         \
  cudaError_t e = cmd;                              \
  if( e != cudaSuccess ) {                          \
    printf("Failed: Cuda error %s:%d '%s'\n",       \
        __FILE__,__LINE__,cudaGetErrorString(e));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0) 

// Returns the current local time as a printable string ending in '\n',
// in the same "Www Mmm dd hh:mm:ss yyyy\n" layout that std::ctime produces.
// Unlike std::ctime -- which formats into a single shared static buffer and
// is not thread-safe -- this formats into a thread_local buffer, so it is
// safe to call concurrently from host-function callbacks / worker threads
// (as this program does).
const char* now() {
  std::time_t t = std::time(nullptr);
  std::tm tmv{};
  localtime_r(&t, &tmv);  // POSIX thread-safe variant of localtime
  thread_local char buf[64];
  std::strftime(buf, sizeof(buf), "%a %b %e %H:%M:%S %Y\n", &tmv);
  return buf;
}

// Host callback for stream_one: sleeps for 10 seconds, then publishes
// completion through the shared atomic flag passed via `arg`.
void CUDART_CB host_fn_one(void* arg) {
  auto* done_flag = static_cast<std::atomic_bool*>(arg);

  std::cout << now() << "stream_one sleeping for 10 sec" << std::endl;
  constexpr auto nap = std::chrono::seconds(10);
  std::this_thread::sleep_for(nap);

  done_flag->store(true);
  std::cout << now() << "stream_one woke up and set stream_one_done to true"
            << std::endl;
}

// Host callback for stream_two: reports whether stream_one's flag is set
// at the moment this callback runs.
void CUDART_CB host_fn_two(void* arg) {
  auto* done_flag = static_cast<std::atomic_bool*>(arg);
  const bool observed = done_flag->load();
  std::cout << now() << "stream_two finds stream_one_done set to "
            << observed << std::endl;
}

// Demonstrates that host functions on two different streams are not
// guaranteed to run concurrently: host_fn_two is observed only after
// host_fn_one finishes its 10-second sleep.
int main(int argc, char* argv[]) {
  CUDACHECK(cudaSetDevice(0));

  std::atomic_bool stream_one_done{false};

  cudaStream_t stream_one, stream_two;
  CUDACHECK(cudaStreamCreateWithFlags(&stream_one, cudaStreamNonBlocking));
  CUDACHECK(cudaStreamCreateWithFlags(&stream_two, cudaStreamNonBlocking));

  // Check the launch results too; the original left these unchecked even
  // though CUDACHECK is used everywhere else in this program.
  CUDACHECK(cudaLaunchHostFunc(stream_one, host_fn_one, &stream_one_done));
  CUDACHECK(cudaLaunchHostFunc(stream_two, host_fn_two, &stream_one_done));

  // Wait for both host functions; also surfaces any asynchronous errors.
  CUDACHECK(cudaDeviceSynchronize());

  // Release the streams (the original leaked them).
  CUDACHECK(cudaStreamDestroy(stream_one));
  CUDACHECK(cudaStreamDestroy(stream_two));

  return 0;
}

Output:

$ ./two_host_func 
Wed Jan 19 00:17:43 2022
stream_one sleeping for 10 sec
Wed Jan 19 00:17:53 2022
stream_one woke up and set stream_one_done to true
Wed Jan 19 00:17:53 2022
stream_two finds stream_one_done set to 1

CUDA makes no guarantees about host functions besides stream semantics. The order of host functions in different streams is unspecified. Whether or not they run concurrently is unspecified.

Of course, you can always simply block your main thread until all CPU work is complete, before submitting more CUDA work to the stream. The same will happen with CUDA host functions. At some point, the driver thread needs to block and wait until the host function is finished to guarantee stream ordering.

If you want “parallel cuda host functions”, simply launch your worker threads from within a cuda host function

#include <atomic>
#include <cassert>
#include <chrono>
#include <cstring>
#include <ctime>
#include <iostream>
#include <thread>
#include <future>

// Checks a CUDA runtime call: on any failure, prints the file/line and the
// CUDA error string, then terminates the process.
#define CUDACHECK(cmd) do {                         \
  cudaError_t e = cmd;                              \
  if( e != cudaSuccess ) {                          \
    printf("Failed: Cuda error %s:%d '%s'\n",       \
        __FILE__,__LINE__,cudaGetErrorString(e));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0) 

// Returns the current local time as a printable string ending in '\n',
// in the same "Www Mmm dd hh:mm:ss yyyy\n" layout that std::ctime produces.
// Unlike std::ctime -- which formats into a single shared static buffer and
// is not thread-safe -- this formats into a thread_local buffer, so it is
// safe to call concurrently from the worker threads this program spawns.
const char* now() {
  std::time_t t = std::time(nullptr);
  std::tm tmv{};
  localtime_r(&t, &tmv);  // POSIX thread-safe variant of localtime
  thread_local char buf[64];
  std::strftime(buf, sizeof(buf), "%a %b %e %H:%M:%S %Y\n", &tmv);
  return buf;
}

// Worker for "stream_one": sleeps 10 seconds, then sets the shared flag
// passed via `arg`.
void worker_one(void* arg) {
  auto* done_flag = static_cast<std::atomic_bool*>(arg);

  std::cout << now() << "stream_one sleeping for 10 sec" << std::endl;
  constexpr auto nap = std::chrono::seconds(10);
  std::this_thread::sleep_for(nap);

  done_flag->store(true);
  std::cout << now() << "stream_one woke up and set stream_one_done to true"
            << std::endl;
}

// Worker for "stream_two": reports whether stream_one's flag is set at the
// moment this worker runs.
void worker_two(void* arg) {
  auto* done_flag = static_cast<std::atomic_bool*>(arg);
  const bool observed = done_flag->load();
  std::cout << now() << "stream_two finds stream_one_done set to "
            << observed << std::endl;
}

// Single CUDA host function that fans out to two worker threads and returns
// only after both have finished. Because the workers run on their own
// threads, they can overlap even though CUDA may serialize host functions.
void CUDART_CB host_fn(void* arg) {
  std::thread t1(worker_one, arg);
  std::thread t2(worker_two, arg);

  t1.join();
  t2.join();
}

// Launches a single host function that internally runs two workers in
// parallel, sidestepping the host-function serialization shown earlier.
int main(int argc, char* argv[]) {
  CUDACHECK(cudaSetDevice(0));

  std::atomic_bool stream_one_done{false};

  cudaStream_t stream_one, stream_two;
  CUDACHECK(cudaStreamCreateWithFlags(&stream_one, cudaStreamNonBlocking));
  CUDACHECK(cudaStreamCreateWithFlags(&stream_two, cudaStreamNonBlocking));

  // Check the launch result too; the original left it unchecked even though
  // CUDACHECK is used everywhere else in this program.
  CUDACHECK(cudaLaunchHostFunc(stream_one, host_fn, &stream_one_done));

  // Wait for the host function; also surfaces any asynchronous errors.
  CUDACHECK(cudaDeviceSynchronize());

  // Release the streams (the original leaked them).
  CUDACHECK(cudaStreamDestroy(stream_one));
  CUDACHECK(cudaStreamDestroy(stream_two));

  return 0;
}

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.