tmurray,
The original code was wrapped in a Fortran application. Here is the code I wrote according to that, just to show the problem:
Actually, after looking at the results of this small function, I found even with the while loop, the second stream still has wrong results.
#include <stdio.h>
#include <cutil_inline.h>
#include <sys/types.h>
#include <sys/time.h>
#include <assert.h>
#include <time.h>
#include “cuda.h”
#include “util.h”
global void kernel(double* t, int i)
{
t[threadIdx.x+i*128] ++;
__syncthreads();
}
int main()
{
cudaStream_t streams;
size_t nstreams=2;
double t, t_p, t_d;
int n = 256, i, size_block_t=nsizeof(double)/nstreams;
dim3 dimBlock(128,1);
dim3 dimGrid(1,1);
streams = (cudaStream_t ) malloc(nstreams * sizeof(cudaStream_t));
t = (double)malloc(nsizeof(double));
cudaMallocHost((void)&t_p, nsizeof(double));
cudaMalloc((void**)&t_d, nsizeof(double));
for(i=0;i<n;i++)t[i]=2.0;
for(i=0;i<n;i++)printf(“t[%d] is %lf\n”, i, t[i]);
assert(streams != NULL);
for (i = 0; i < nstreams; ++i) {
cutilSafeCall(cudaStreamCreate(&streams[i]));
}
for (i = 0; i < nstreams; ++i) {
cutilSafeCall(cudaMemcpyAsync(((char *) t_p) + i * size_block_t, ((char *) t) + i * size_block_t, size_block_t, cudaMemcpyHostToHost, streams[i]));
cutilSafeCall(cudaMemcpyAsync(((char *) t_d) + i * size_block_t, ((char *) t_p) + i * size_block_t, size_block_t, cudaMemcpyHostToDevice, streams[i]));
kernel<<<dimGrid,dimBlock,0,streams[i]>>>(t_d, i);
cutilCheckMsg("Kernel execution failed");
cutilSafeCall(cudaMemcpyAsync(((char *) t_p) + i * size_block_t, ((char *) t_d) + i * size_block_t, size_block_t, cudaMemcpyDeviceToHost, streams[i]));
while (cudaStreamQuery(streams[i]) != cudaSuccess); // wait
cutilSafeCall(cudaMemcpyAsync(((char *) t) + i * size_block_t, ((char *) t_p) + i * size_block_t, size_block_t, cudaMemcpyHostToHost, streams[i]));
}
for(i=0;i<nstreams;i++)while (cudaStreamQuery(streams[i]) != cudaSuccess); // wait
cudaThreadSynchronize();
for(i=0;i<n;i++)printf("t[%d] is %lf\n",i, t[i]);
cudaFreeHost(t_p);
cudaFree(t_d);
free(t);
return 0;
}