// Software-pipelined tile loop over a unified-memory buffer `a` (num_tiles
// tiles of tile_size size_t elements). Per iteration: kernels in s1 consume
// tile i while s2 prefetches tile i+1 to the GPU (device 0), and tile i is
// evicted back to the host via a prefetch queued behind the kernels in s1
// (the "deferred" path, since s1 is busy when it is enqueued).
// NOTE(review): a, tile_size, num_tiles, num_kernels, streams s1/s2/s3,
// events e1/e2 and temporaries st/et are declared outside this fragment —
// confirm the events are created before entry; iteration 0 waits on e2
// before it has ever been recorded (a documented no-op, but only if the
// event object itself is valid). No CUDA calls here are error-checked;
// consider a CUDA_CHECK wrapper at the enclosing scope.
// prefetch first tile
cudaMemPrefetchAsync(a, tile_size * sizeof(size_t), 0, s2);
// e1 marks completion of the initial HtoD prefetch so the first
// iteration's kernels don't start before tile 0 is resident.
cudaEventRecord(e1, s2);
for (int i = 0; i < num_tiles; i++) {
// make sure previous kernel and current tile copy both completed
// (host-blocking waits; after the end-of-loop swap, e1/e2 cover the
// prior iteration's kernel work and the current tile's prefetch)
cudaEventSynchronize(e1);
cudaEventSynchronize(e2);
// run multiple kernels on current tile
// NOTE(review): fixed <<<1024, 1024>>> launch — kernel is presumably
// grid-stride or sized to tile_size; not visible here, confirm.
for (int j = 0; j < num_kernels; j++)
kernel<<<1024, 1024, 0, s1>>>(tile_size, a + tile_size * i);
// e1 now marks completion of this tile's kernel chain in s1.
cudaEventRecord(e1, s1);
// prefetch next tile to the gpu in a separate stream
if (i < num_tiles-1) {
// make sure the stream is idle to force non-deferred HtoD prefetches first
cudaStreamSynchronize(s2);
cudaMemPrefetchAsync(a + tile_size * (i+1), tile_size * sizeof(size_t), 0, s2);
// e2 marks completion of the next tile's HtoD prefetch.
cudaEventRecord(e2, s2);
}
// offload current tile to the cpu after the kernel is completed using the deferred path
// (queued in s1 behind the kernels, so it runs only once they finish)
cudaMemPrefetchAsync(a + tile_size * i, tile_size * sizeof(size_t), cudaCpuDeviceId, s1);
// rotate streams and swap events
// Net effect of the two swaps: s1 <- old s2, s2 <- old s3, s3 <- old s1,
// so next iteration computes in the stream that just prefetched its tile,
// prefetches in a third (idle) stream, and the old compute stream drains
// its DtoH eviction in the background. Events swap roles: e1 <-> e2.
st = s1; s1 = s2; s2 = st;
st = s2; s2 = s3; s3 = st;
et = e1; e1 = e2; e2 = et;
}
It's an ordinary double-buffering approach: processing always happens in stream s1, while copying happens in stream s2.
The streams are then rotated (cycling through a third stream, s3, so the previous compute stream can drain its device-to-host eviction in the background) such that in the next iteration processing happens in the old stream s2 — the one that just prefetched the tile — which has now become s1.