I am trying to use one stream to write data to global memory and another stream to load the data filled in by the first stream back to the host concurrently. The code as following:
__global__ void kernel(volatile int* acc)
{
int tid = threadIdx.x;
acc[tid] = tid;
}
int main()
{
cudaStream_t streams[2];
cudaStreamCreate(&streams[0]);
cudaStreamCreate(&streams[1]);
int* dev_acc;
int* acc = new int[64];
cudaMalloc(&dev_acc, sizeof(int) * 64);
kernel << < 1, 64, 0, streams[0] >> > (dev_acc);
for (int i = 0; i < 64; i++)
{
int temp = 0;
int temp_2 = 0;
cudaMemcpyAsync(&temp, dev_acc + i, sizeof(int), cudaMemcpyDeviceToHost, streams[1]);
cudaMemcpyAsync(&temp_2, dev_acc + i + 1, sizeof(int), cudaMemcpyDeviceToHost, streams[1]);
cudaStreamSynchronize(streams[1]);
if (i != 63) printf("acc[%d] = %d, acc[%d] = %d\n", i, temp, i+1, temp_2);
else printf("acc[%d] = %d\n", i, temp);
}
cudaStreamSynchronize(streams[0]);
cudaMemcpy(acc, dev_acc, sizeof(int) * 64, cudaMemcpyDeviceToHost);
for (int i = 0; i < 64; i++)
{
printf("acc[%d] = %d\n", i, acc[i]);
}
delete[] acc;
cudaDeviceReset();
return 0;
}
However, the output showed that all the data printed within the for loop at line 21 equals to 0. Yet the memory copy after stream synchronizing at line 36 gave me the correctly filled up data set, i.e. acc[0] = 0, acc[1] = 1, acc[n] = n;
The profiler timeline analysis:
https://drive.google.com/open?id=0B-0UQdS3Z6R-cHF4MlVqYkZXV2M
where stream13 running kernel, 14 doing async memory copy, 15 is the default stream doing memory copy at line 17 and 34.
From the profile we can see that the kernel is finished before the async mem copy, thus data filling should be completed before the first time we copy the data back to host. Then why is all the data printed at line 28 equal to 0(the initialized value with cudamalloc)?
My guess is the writes done by stream 13 are still in caches instead of the global memory before the occurrence of first async memory copy. Is there a way to make sure the data goes to the device memory instead of staying in cache before executing next command in the device code?
Program is compiled using CUDA 7.5 runtime on Windows 10 with GTX980M.
Thanks in advance.