cudaLaunchHostFunc requires cudaStreamSynchronize

Hi,

I’m trying to build an asynchonous pipeline I need to call host function “cudaLaunchHostFunc” would probably nice, however I need to synchonize the stream to get consistent results in the following code

include <cuda_runtime.h>
include
include

[code]
global void vectorAdd( int * buff, int numElements, int expected) {
int i = blockDim.x * blockIdx.x + threadIdx.x;

if (i < numElements) {
int val = buff[i];

  if (val == expected)
      buff[i] = val + 1;
  else
      buff[i] = -val;

}
}

struct simple_add_with_check_data
{
int* buff;
int expected;
int size;
};

void simple_add_with_check(void* data)
{
simple_add_with_check_data* datac = reinterpret_cast<simple_add_with_check_data*>(data);

for (int globalIdX = 0; globalIdX < datac->size; globalIdX++)
{
    int val = datac->buff[globalIdX];

    if (val == datac->expected)
        datac->buff[globalIdX] = val + 1;
    else
        datac->buff[globalIdX] = -val;
}

}

int main(void)
{
cudaError_t err = cudaSuccess;
int numElements = 50000;
size_t size = numElements * sizeof(float);

int* h_A;
cudaMallocHost(&h_A,size);

int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
cudaStream_t stream[1];
for (int i = 0; i < 1; ++i)
cudaStreamCreate(&stream[i]);
int it = 32;
int expected = 0;
for (int i = 0; i < it; i++)
{

  if (expected % 2 == 0) {
      vectorAdd << <blocksPerGrid, threadsPerBlock,0, stream[0] >> > (h_A, numElements,expected);
      err = cudaGetLastError();
  }
  else
  {
      //cudaStreamSynchronize(stream[0]);
      simple_add_with_check_data data;
      data.expected = expected;
      data.size = numElements;
      data.buff = h_A;
      cudaLaunchHostFunc(stream[0], &simple_add_with_check, &data);
  }
  expected++;

}

cudaStreamSynchronize(stream[0]);

bool res = std::all_of(h_A, h_A + numElements, [expected](int i) {return i == expected; });

if (res)
std::cout << “OK\n”;
else
std::cout << “NOT OK\n”;

return 0;
}
[\code]

It kinds of remove the good part of cudaLaunchostFunction, am I missing something ?

Thanks,

Anarchitectus

Yes, you need to synchronize at the end to ensure that all cuda streams have completed.

{
      //cudaStreamSynchronize(stream[0]);
      simple_add_with_check_data data;
      data.expected = expected;
      data.size = numElements;
      data.buff = h_A;
      cudaLaunchHostFunc(stream[0], &simple_add_with_check, &data);
  }

This code is broken. You need to make sure that data is valid until the host callback finishes execution. The way it is currently, data will be destroyed before because it goes out of scope. The solution would be to allocate the data on the heap, and to delete it at the end of the callback function.

indeed, realized that too late ! thanks