Weird behaviour of CUDA streams

Hi,

I’m trying to overlap memcpy and kernel execution using streams. If I do it as in the Programming Guide example - first issuing all host-to-device copies, then all kernel launches, then all device-to-host copies - it works:

[codebox]for(int i=0;i<stream_count;i++) {
    int begin=vector_size*i/stream_count;
    int end=vector_size*(i+1)/stream_count;
    int size=end-begin;
    // copy input vector to gpu //
    cudaMemcpyAsync(vector_input_gpu+begin,vector_input_cpu+begin,
                    size*sizeof(float),
                    cudaMemcpyHostToDevice,streams[i]);
}

for(int i=0;i<stream_count;i++) {
    int begin=vector_size*i/stream_count;
    int end=vector_size*(i+1)/stream_count;
    int size=end-begin;
    // cuda kernel call //
    compute<<<30,32,0,streams[i]>>>(vector_input_gpu+begin,
                                    vector_output_gpu+begin,
                                    size);
}

for(int i=0;i<stream_count;i++) {
    int begin=vector_size*i/stream_count;
    int end=vector_size*(i+1)/stream_count;
    int size=end-begin;
    // copy output vector from gpu to cpu //
    cudaMemcpyAsync(vector_output_cpu+begin,vector_output_gpu+begin,
                    size*sizeof(float),
                    cudaMemcpyDeviceToHost,streams[i]);
}[/codebox]

But I wonder why I cannot put everything in one loop - first issuing all operations of stream 0, then of stream 1, and so on. If I try this, it is as slow as the non-streamed version; it seems there is no overlap at all then:

[codebox]for(int i=0;i<stream_count;i++) {
    int begin=vector_size*i/stream_count;
    int end=vector_size*(i+1)/stream_count;
    int size=end-begin;
    // copy input vector to gpu //
    cudaMemcpyAsync(vector_input_gpu+begin,vector_input_cpu+begin,
                    size*sizeof(float),
                    cudaMemcpyHostToDevice,streams[i]);
    // cuda kernel call //
    compute<<<30,32,0,streams[i]>>>(vector_input_gpu+begin,
                                    vector_output_gpu+begin,
                                    size);
    // copy output vector from gpu to cpu //
    cudaMemcpyAsync(vector_output_cpu+begin,vector_output_gpu+begin,
                    size*sizeof(float),
                    cudaMemcpyDeviceToHost,streams[i]);
}[/codebox]

Can someone explain this to me?

If I combine just the in-copy and the kernel, or just the kernel and the out-copy, in one loop it works fine, but not if I combine all three. I find this really weird.
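
In case it matters, the streams and buffers are set up roughly like this (just a sketch - the sizes are placeholders, but the host buffers are allocated with cudaMallocHost, since cudaMemcpyAsync only behaves asynchronously with page-locked host memory):

[codebox]// setup sketch (values are placeholders): streams plus pinned host buffers
const int stream_count=4;
const int vector_size=1<<22;

cudaStream_t streams[stream_count];
for(int i=0;i<stream_count;i++)
    cudaStreamCreate(&streams[i]);

float *vector_input_cpu,*vector_output_cpu;
cudaMallocHost((void**)&vector_input_cpu,vector_size*sizeof(float));   // pinned host memory
cudaMallocHost((void**)&vector_output_cpu,vector_size*sizeof(float));  // pinned host memory

float *vector_input_gpu,*vector_output_gpu;
cudaMalloc((void**)&vector_input_gpu,vector_size*sizeof(float));
cudaMalloc((void**)&vector_output_gpu,vector_size*sizeof(float));[/codebox]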