I have a simple loop on the host: my CUDA kernel performs some task and fills the dev_found variable, and if it is true the loop exits. But the device-to-host cudaMemcpy takes some time on every iteration. How can I avoid using a host variable?
// Repeatedly launches find_pivot_row and copies the device flag dev_found back
// to the host after every launch. The loop ends when the flag reads false,
// when a CUDA call reports an error, or after 1000000 iterations.
//
// pivot_status keeps the result of the last CUDA call made by the loop
// (cudaSuccess when nothing failed), so code after the loop can tell an
// error exit apart from a normal one.
cudaError_t pivot_status = cudaSuccess;
for (int i = 0; i < 1000000; i++)
{
    find_pivot_row<<<numBlocks, blockSize>>>(dev_m, dev_b, dev_data, dev_index, dev_found);

    // A kernel launch has no return value; launch failures (e.g. an invalid
    // configuration) are only reported through cudaGetLastError().
    pivot_status = cudaGetLastError();
    if (pivot_status != cudaSuccess)
    {
        break;
    }

    // Initialized so the flag is never read with an indeterminate value if
    // the copy below fails.
    bool found = false;

    // Copies the flag into a host variable; the host decision below depends
    // on this value, so the copy happens once per iteration.
    pivot_status = cudaMemcpy(&found, dev_found, sizeof(bool), cudaMemcpyDeviceToHost);
    if (pivot_status != cudaSuccess)
    {
        break;
    }

    if (!found)
    {
        break;
    }
}