I want to improve the performance of my program by overlapping memory copies with kernel execution, using nstreams CUDA streams as follows:

// Benchmark: repeat "nstreams kernel launches + nstreams device-to-host copies"
// nreps times and report the average time per repetition.
memset(a, 255, nbytes); // set host memory bits to all 1s, for testing correctness
CUDA_SAFE_CALL( cudaMemset(d_a, 0, nbytes) ); // set device memory to all 0s, for testing correctness

CUDA_SAFE_CALL( cudaEventRecord(start_event, 0) );
for(int k = 0; k < nreps; k++)
{
    // Braces are required here: each repetition must contain BOTH the kernel
    // launches and the memcopies, because the result below is divided by nreps.

    // asynchronously launch nstreams kernels, each operating on its own portion of data
    for(int i = 0; i < nstreams; i++)
    {
        init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, d_c, niterations);
    }
    // Kernel launches do not return an error code; pick up launch failures here.
    CUDA_SAFE_CALL( cudaGetLastError() );

    // asynchronously launch nstreams memcopies. Note that memcopy in stream x will only
    // commence executing when all previous CUDA calls in stream x have completed
    // NOTE(review): for the copies to be truly asynchronous, `a` must be page-locked
    // host memory (e.g. from cudaMallocHost) -- its allocation is not visible here; confirm.
    for(int i = 0; i < nstreams; i++)
    {
        CUDA_SAFE_CALL( cudaMemcpyAsync(a + i * n / nstreams, d_a + i * n / nstreams, nbytes / nstreams, cudaMemcpyDeviceToHost, streams[i]) );
    }
}
CUDA_SAFE_CALL( cudaEventRecord(stop_event, 0) );
// The stop event must have completed before its timestamp can be read;
// otherwise cudaEventElapsedTime fails with cudaErrorNotReady.
CUDA_SAFE_CALL( cudaEventSynchronize(stop_event) );
CUDA_SAFE_CALL( cudaEventElapsedTime(&elapsed_time, start_event, stop_event) );
printf("%d streams:\t%.2f (%.2f expected with compute capability 1.1 or later)\n", nstreams, elapsed_time / nreps, time_kernel + time_memcpy / nstreams);

That works for one kernel and one copy. Could anyone tell me how to do the same with several copies and several kernels? Should I loop over the copies and then loop over the kernels? And should I increase the ‘0’ in the execution configuration of the second kernel (kernel2) by 1, or how should it be done?

Thank you for your help.