Question about cudaMemcpyAsync Are they ordered?

I have the following code sequence:

for (i=0;i<nstream;i++)
{
cudaMemcpyAsync(…, HostToHost, stream[i]);
cudaMemcpyAsync(…, HostToDevice, stream[i]);
cudaMemcpyAsync(…, DeviceToHost, stream[i]);
cudaMemcpyAsync(…, HostToHost, stream[i]);
}

Are the copies inside one stream ordered? It looks like the last hosttohost copy does not wait until the previous DeviceToHost copy finishes. So all those copies are out-of-order?

Copies within a stream are ordered. If you think you’ve found a bug, please post a full repro.

Yes, I think so. Ah… how (and where) can I post a full report?

Just post it here, I’ll do the necessary legwork.

This is the code segment:

for (i = 0; i < nstreams; ++i) {

    cutilSafeCall(cudaMemcpyAsync(((char *) t_p) + i * size_block_t, ((char *) t) + i * size_block_t, size_block_t, cudaMemcpyHostToHost, streams[i]));

    cutilSafeCall(cudaMemcpyAsync(((char *) t_d) + i * size_block_t, ((char *) t_p) + i * size_block_t, size_block_t, cudaMemcpyHostToDevice, streams[i]));

    kernel<<<dimGrid,dimBlock,0,streams[i]>>>(t_d);

    cutilCheckMsg("Kernel execution failed");

    cutilSafeCall(cudaMemcpyAsync(((char *) t_p) + i * size_block_t, ((char *) t_d) + i * size_block_t, size_block_t, cudaMemcpyDeviceToHost, streams[i]));

while (cudaStreamQuery(streams[i]) != cudaSuccess); // wait

cutilSafeCall(cudaMemcpyAsync(((char *) t) + i * size_block_t, ((char *) t_p) + i * size_block_t, size_block_t, cudaMemcpyHostToHost, streams[i]));

}

cudaThreadSynchronize();

The results show that t is modified by the kernel; But if we comment out the while loop, t still has the original value, without being modified.

Can you post full source that I can compile directly?

tmurray,

The original code was wrapped in a Fortran application. Here is the code I wrote according to that, just to show the problem:
Actually, after looking at the results of this small function, I found even with the while loop, the second stream still has wrong results.

#include <stdio.h>
#include <cutil_inline.h>
#include <sys/types.h>
#include <sys/time.h>
#include <assert.h>
#include <time.h>
#include “cuda.h”
#include “util.h”

global void kernel(double* t, int i)
{
t[threadIdx.x+i*128] ++;
__syncthreads();
}

int main()
{
cudaStream_t streams;
size_t nstreams=2;
double t, t_p, t_d;
int n = 256, i, size_block_t=n
sizeof(double)/nstreams;
dim3 dimBlock(128,1);
dim3 dimGrid(1,1);
streams = (cudaStream_t ) malloc(nstreams * sizeof(cudaStream_t));
t = (double
)malloc(n
sizeof(double));
cudaMallocHost((void
)&t_p, nsizeof(double));
cudaMalloc((void**)&t_d, n
sizeof(double));
for(i=0;i<n;i++)t[i]=2.0;
for(i=0;i<n;i++)printf(“t[%d] is %lf\n”, i, t[i]);
assert(streams != NULL);
for (i = 0; i < nstreams; ++i) {
cutilSafeCall(cudaStreamCreate(&streams[i]));
}

for (i = 0; i < nstreams; ++i) {
    cutilSafeCall(cudaMemcpyAsync(((char *) t_p) + i * size_block_t, ((char *) t) + i * size_block_t, size_block_t, cudaMemcpyHostToHost, streams[i]));
    cutilSafeCall(cudaMemcpyAsync(((char *) t_d) + i * size_block_t, ((char *) t_p) + i * size_block_t, size_block_t, cudaMemcpyHostToDevice, streams[i]));
    kernel<<<dimGrid,dimBlock,0,streams[i]>>>(t_d, i);
    cutilCheckMsg("Kernel execution failed");
    cutilSafeCall(cudaMemcpyAsync(((char *) t_p) + i * size_block_t, ((char *) t_d) + i * size_block_t, size_block_t, cudaMemcpyDeviceToHost, streams[i]));
    while (cudaStreamQuery(streams[i]) != cudaSuccess); // wait
    cutilSafeCall(cudaMemcpyAsync(((char *) t) + i * size_block_t, ((char *) t_p) + i * size_block_t, size_block_t, cudaMemcpyHostToHost, streams[i]));
  }
    for(i=0;i<nstreams;i++)while (cudaStreamQuery(streams[i]) != cudaSuccess); // wait
cudaThreadSynchronize();
for(i=0;i<n;i++)printf("t[%d] is %lf\n",i, t[i]);
cudaFreeHost(t_p);
cudaFree(t_d);
free(t);
return 0;

}