Hi,
Can someone explain why there is no cudaStreamSynchronize in the “time execution with nstreams streams” part of the code?
I was expecting:
for(int k = 0; k < nreps; k++)
{
// asynchronously launch nstreams kernels, each operating on its own portion of data
for(int i = 0; i < nstreams; i++)
init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, d_c, niterations);
// asynchronously launch nstreams memcopies. Note that memcopy in stream x will only
// commence executing when all previous CUDA calls in stream x have completed
for(int i = 0; i < nstreams; i++)
cudaMemcpyAsync(a + i * n / nstreams, d_a + i * n / nstreams, nbytes / nstreams, cudaMemcpyDeviceToHost, streams[i]);
///////// THIS PART MISSING ?????
for(int i = 0; i < nstreams; i++)
cudaStreamSynchronize(streams[i]);
}
Thanks
JE