How to use cudaImportExternalSemaphore in Linux?

Is there a way to create a Semaphore in CPU memory in Linux and use it in CUDA? I want to block all operations on a stream until the Semaphore is signaled by CPU code.

Documentation seems to suggest cudaImportExternalSemaphore can be used to import an external Semaphore and cudaWaitExternalSemaphoresAsync can be used to make a stream wait until the Semaphore is signaled. But there is no example that shows how to import a Semaphore created in Linux.

What semaphore can be imported? Posix sem_t? Is there an example someone can point to? I see some examples that use Vulkan. Can only semaphores created in Vulkan be imported?

The document also says “It is illegal to issue a wait before the corresponding signal has been issued.” Does this mean, it is illegal to wait on the semaphore before the Semaphore is signaled? I’m trying to understand what is the benefit of using a Semaphore if wait cannot be called before the semaphore is signaled.

Thanks,
Indu

The programming guide shows how to use this API. Programming Guide :: CUDA Toolkit Documentation
It seems it cannot be used for arbitrary semaphores, but only ones exported by vulkan, opengl, direct3d, and nvsci.

I tried to import an eventfd-based semaphore, but this just gives an unknown error


#include <future>
#include <iostream>
#include <cassert>
#include <chrono>

#include <sys/eventfd.h>
#include <unistd.h>

// The eventfd counter is a 64-bit value; this code reads and writes it
// through size_t variables, so require size_t to be 8 bytes wide.
static_assert(sizeof(size_t) == 8);

// Wraps a CUDA runtime call and reports the call site on failure.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prints a readable CUDA error message with file/line and, unless
// abort==false, terminates the process using the error code as exit status.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// Trivial kernel used to observe when work queued on the stream is actually
// allowed to run; its device-side printf appears once the kernel executes.
__global__
void printkernel(){
    printf("printkernel started\n");
}

// Sleeps briefly, then signals the eventfd-based semaphore once by adding 1
// to its 64-bit counter.
void workerthread1(int semaphorefd){
    std::cerr << "workerthread1 started\n";
    std::this_thread::sleep_for(std::chrono::seconds{2});

    const size_t increment = 1;
    const long numbyteswritten = write(semaphorefd, &increment, sizeof(increment));
    assert(numbyteswritten == sizeof(increment));
}

int main(){

    // Create an eventfd in semaphore mode: each read decrements the 64-bit
    // counter by 1 and blocks while the counter is zero.
    int semfd = eventfd(0, EFD_SEMAPHORE);
    if(semfd == -1){
        perror("eventfd");
        assert(false);
    }

    // Signal the eventfd from another thread after a short delay.
    auto future = std::async(std::launch::async, workerthread1, semfd);

    std::cerr << "waiting for semaphore\n";
    // This read blocks until workerthread1 writes to the eventfd.
    size_t semaphorevalue; 
    int numbytesreads = read(semfd, &semaphorevalue, 8);
    assert(numbytesreads == 8);

    std::cerr << "waited for semaphore\n";
    future.wait();

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Attempt to hand the raw eventfd to CUDA as an "opaque FD" external
    // semaphore. NOTE(review): the opaque-FD handle type appears to be meant
    // for handles exported by APIs such as Vulkan, not arbitrary Linux fds,
    // which would explain why this import fails with an unknown error.
    cudaExternalSemaphore_t externalSem;
    cudaExternalSemaphoreHandleDesc externSemDesc;
    memset(&externSemDesc, 0, sizeof(cudaExternalSemaphoreHandleDesc));
    externSemDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd;
    externSemDesc.handle.fd = semfd;

    gpuErrchk(cudaImportExternalSemaphore (&externalSem, &externSemDesc));  //does not work

    // Intended flow (had the import succeeded): make `stream` wait on the
    // semaphore, queue a kernel behind that wait, and signal from a second
    // worker thread so the kernel is released.
    cudaExternalSemaphoreWaitParams waitParams;
    memset(&waitParams, 0, sizeof(cudaExternalSemaphoreWaitParams));
    gpuErrchk(cudaWaitExternalSemaphoresAsync(&externalSem, &waitParams, 1, stream)); 
    printkernel<<<1,1,0,stream>>>();

    auto future2 = std::async(std::launch::async, workerthread1, semfd);

    gpuErrchk(cudaDeviceSynchronize());

    future2.wait();
}

Do you really need a semaphore? You can use cudaLaunchHostFunc to get stream-ordered CPU work, and CUDA events can then be used to synchronize two streams.

I tried cudaLaunchHostFunc but it appears two host functions cannot execute at the same time. So, if I run host_fn_wait_for_semaphore in stream_one and then run host_fn_signal_semaphore in stream_two, the program hangs because CUDA runtime executes host_fn_signal_semaphore only after host_fn_wait_for_semaphore returns (which never happens because it is waiting on a semaphore).

Here is a simple example showing this problem. host_fn_one on stream_one sleeps for 10 sec and then sets an atomic boolean to true. host_fn_two on stream_two simply reads the boolean. You can see that host_fn_two runs only after 10 seconds after host_fn_one is complete.

Is there a way to make multiple host functions execute concurrently?

#include <cuda_runtime.h>

#include <atomic>
#include <cassert>
#include <chrono>
#include <cstring>
#include <ctime>
#include <iostream>
#include <thread>

// Checks a CUDA runtime call: on any failure, prints the file/line and the
// CUDA error string, then terminates the process.
#define CUDACHECK(cmd) do {                         \
  cudaError_t e = cmd;                              \
  if( e != cudaSuccess ) {                          \
    printf("Failed: Cuda error %s:%d '%s'\n",       \
        __FILE__,__LINE__,cudaGetErrorString(e));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0) 

// Returns the current local time as a printable string ending in '\n',
// in the same "Www Mmm dd hh:mm:ss yyyy\n" layout that std::ctime produces.
// Unlike std::ctime -- which formats into a single shared static buffer and
// is not thread-safe -- this formats into a thread_local buffer, so it is
// safe to call concurrently from host-function callbacks / worker threads
// (as this program does).
const char* now() {
  std::time_t t = std::time(nullptr);
  std::tm tmv{};
  localtime_r(&t, &tmv);  // POSIX thread-safe variant of localtime
  thread_local char buf[64];
  std::strftime(buf, sizeof(buf), "%a %b %e %H:%M:%S %Y\n", &tmv);
  return buf;
}

// Host callback for stream_one: sleeps for 10 seconds, then publishes
// completion through the shared atomic flag passed via `arg`.
void CUDART_CB host_fn_one(void* arg) {
  auto* done_flag = static_cast<std::atomic_bool*>(arg);

  std::cout << now() << "stream_one sleeping for 10 sec" << std::endl;
  constexpr auto nap = std::chrono::seconds(10);
  std::this_thread::sleep_for(nap);

  done_flag->store(true);
  std::cout << now() << "stream_one woke up and set stream_one_done to true"
            << std::endl;
}

// Host callback for stream_two: reports whether stream_one's flag is set
// at the moment this callback runs.
void CUDART_CB host_fn_two(void* arg) {
  auto* done_flag = static_cast<std::atomic_bool*>(arg);
  const bool observed = done_flag->load();
  std::cout << now() << "stream_two finds stream_one_done set to "
            << observed << std::endl;
}

// Demonstrates that host functions on two different streams are not
// guaranteed to run concurrently: host_fn_two is observed only after
// host_fn_one finishes its 10-second sleep.
int main(int argc, char* argv[]) {
  CUDACHECK(cudaSetDevice(0));

  std::atomic_bool stream_one_done{false};

  cudaStream_t stream_one, stream_two;
  CUDACHECK(cudaStreamCreateWithFlags(&stream_one, cudaStreamNonBlocking));
  CUDACHECK(cudaStreamCreateWithFlags(&stream_two, cudaStreamNonBlocking));

  // Check the launch results too; the original left these unchecked even
  // though CUDACHECK is used everywhere else in this program.
  CUDACHECK(cudaLaunchHostFunc(stream_one, host_fn_one, &stream_one_done));
  CUDACHECK(cudaLaunchHostFunc(stream_two, host_fn_two, &stream_one_done));

  // Wait for both host functions; also surfaces any asynchronous errors.
  CUDACHECK(cudaDeviceSynchronize());

  // Release the streams (the original leaked them).
  CUDACHECK(cudaStreamDestroy(stream_one));
  CUDACHECK(cudaStreamDestroy(stream_two));

  return 0;
}

Output:

$ ./two_host_func 
Wed Jan 19 00:17:43 2022
stream_one sleeping for 10 sec
Wed Jan 19 00:17:53 2022
stream_one woke up and set stream_one_done to true
Wed Jan 19 00:17:53 2022
stream_two finds stream_one_done set to 1

CUDA makes no guarantees about host functions besides stream semantics. The order of host functions in different streams is unspecified. Whether or not they run concurrently is unspecified.

Of course, you can always simply block your main thread until all CPU work is complete, before submitting more CUDA work to the stream. The same will happen with CUDA host functions. At some point, the driver thread needs to block and wait until the host function is finished to guarantee stream ordering.

If you want “parallel cuda host functions”, simply launch your worker threads from within a cuda host function

#include <atomic>
#include <cassert>
#include <chrono>
#include <cstring>
#include <ctime>
#include <iostream>
#include <thread>
#include <future>

// Checks a CUDA runtime call: on any failure, prints the file/line and the
// CUDA error string, then terminates the process.
#define CUDACHECK(cmd) do {                         \
  cudaError_t e = cmd;                              \
  if( e != cudaSuccess ) {                          \
    printf("Failed: Cuda error %s:%d '%s'\n",       \
        __FILE__,__LINE__,cudaGetErrorString(e));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0) 

// Returns the current local time as a printable string ending in '\n',
// in the same "Www Mmm dd hh:mm:ss yyyy\n" layout that std::ctime produces.
// Unlike std::ctime -- which formats into a single shared static buffer and
// is not thread-safe -- this formats into a thread_local buffer, so it is
// safe to call concurrently from the worker threads this program spawns.
const char* now() {
  std::time_t t = std::time(nullptr);
  std::tm tmv{};
  localtime_r(&t, &tmv);  // POSIX thread-safe variant of localtime
  thread_local char buf[64];
  std::strftime(buf, sizeof(buf), "%a %b %e %H:%M:%S %Y\n", &tmv);
  return buf;
}

// Worker for "stream_one": sleeps 10 seconds, then sets the shared flag
// passed via `arg`.
void worker_one(void* arg) {
  auto* done_flag = static_cast<std::atomic_bool*>(arg);

  std::cout << now() << "stream_one sleeping for 10 sec" << std::endl;
  constexpr auto nap = std::chrono::seconds(10);
  std::this_thread::sleep_for(nap);

  done_flag->store(true);
  std::cout << now() << "stream_one woke up and set stream_one_done to true"
            << std::endl;
}

// Worker for "stream_two": reports whether stream_one's flag is set at the
// moment this worker runs.
void worker_two(void* arg) {
  auto* done_flag = static_cast<std::atomic_bool*>(arg);
  const bool observed = done_flag->load();
  std::cout << now() << "stream_two finds stream_one_done set to "
            << observed << std::endl;
}

// Single CUDA host function that fans out to two worker threads and returns
// only after both have finished. Because the workers run on their own
// threads, they can overlap even though CUDA may serialize host functions.
void CUDART_CB host_fn(void* arg) {
  std::thread t1(worker_one, arg);
  std::thread t2(worker_two, arg);

  t1.join();
  t2.join();
}

// Launches a single host function that internally runs two workers in
// parallel, sidestepping the host-function serialization shown earlier.
int main(int argc, char* argv[]) {
  CUDACHECK(cudaSetDevice(0));

  std::atomic_bool stream_one_done{false};

  cudaStream_t stream_one, stream_two;
  CUDACHECK(cudaStreamCreateWithFlags(&stream_one, cudaStreamNonBlocking));
  CUDACHECK(cudaStreamCreateWithFlags(&stream_two, cudaStreamNonBlocking));

  // Check the launch result too; the original left it unchecked even though
  // CUDACHECK is used everywhere else in this program.
  CUDACHECK(cudaLaunchHostFunc(stream_one, host_fn, &stream_one_done));

  // Wait for the host function; also surfaces any asynchronous errors.
  CUDACHECK(cudaDeviceSynchronize());

  // Release the streams (the original leaked them).
  CUDACHECK(cudaStreamDestroy(stream_one));
  CUDACHECK(cudaStreamDestroy(stream_two));

  return 0;
}

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.