Hi,
I’m trying to build an asynchonous pipeline I need to call host function “cudaLaunchHostFunc” would probably nice, however I need to synchonize the stream to get consistent results in the following code
include <cuda_runtime.h>
include
include
[code]
global void vectorAdd( int * buff, int numElements, int expected) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements) {
int val = buff[i];
if (val == expected)
buff[i] = val + 1;
else
buff[i] = -val;
}
}
struct simple_add_with_check_data
{
int* buff;
int expected;
int size;
};
void simple_add_with_check(void* data)
{
simple_add_with_check_data* datac = reinterpret_cast<simple_add_with_check_data*>(data);
for (int globalIdX = 0; globalIdX < datac->size; globalIdX++)
{
int val = datac->buff[globalIdX];
if (val == datac->expected)
datac->buff[globalIdX] = val + 1;
else
datac->buff[globalIdX] = -val;
}
}
int main(void)
{
cudaError_t err = cudaSuccess;
int numElements = 50000;
size_t size = numElements * sizeof(float);
int* h_A;
cudaMallocHost(&h_A,size);
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
cudaStream_t stream[1];
for (int i = 0; i < 1; ++i)
cudaStreamCreate(&stream[i]);
int it = 32;
int expected = 0;
for (int i = 0; i < it; i++)
{
if (expected % 2 == 0) {
vectorAdd << <blocksPerGrid, threadsPerBlock,0, stream[0] >> > (h_A, numElements,expected);
err = cudaGetLastError();
}
else
{
//cudaStreamSynchronize(stream[0]);
simple_add_with_check_data data;
data.expected = expected;
data.size = numElements;
data.buff = h_A;
cudaLaunchHostFunc(stream[0], &simple_add_with_check, &data);
}
expected++;
}
cudaStreamSynchronize(stream[0]);
bool res = std::all_of(h_A, h_A + numElements, [expected](int i) {return i == expected; });
if (res)
std::cout << “OK\n”;
else
std::cout << “NOT OK\n”;
return 0;
}
[\code]
It kinds of remove the good part of cudaLaunchostFunction, am I missing something ?
Thanks,
Anarchitectus