Hi to all!
I am trying to gain experience with CUDA streams, and I am trying to run 3 very simple streams concurrently, just to see how streams work.
The code is presented below:
//-----------------------------------------------------------------------------
// Three independent CUDA streams. Each one is given the same three-step
// pipeline: host->device copy, a single-thread launch of s1, device->host copy.
cudaStream_t stream1, stream2, stream3;
{
    cudaStream_t* const streamSlots[] = { &stream1, &stream2, &stream3 };
    const int kStreamCount = sizeof(streamSlots) / sizeof(streamSlots[0]);
    const size_t kImageBytes = 384512 * sizeof(int);

    for (int i = 0; i < kStreamCount; ++i) {
        cudaStreamCreate(streamSlots[i]);
    }

    // Work is issued one stream at a time (all of stream1, then stream2, ...).
    // NOTE(review): every stream is handed the SAME buffers (img, d_img,
    // d_wrkx, wrkx). Nothing here orders the streams against each other, so
    // if they do overlap, the copies into d_img and wrkx race with one another.
    // NOTE(review): cudaMemcpyAsync generally needs page-locked host memory to
    // overlap with other work; how img/wrkx are allocated is not visible here
    // -- TODO confirm they come from cudaMallocHost/cudaHostAlloc.
    // NOTE(review): none of the return codes below are checked.
    for (int i = 0; i < kStreamCount; ++i) {
        cudaStream_t s = *streamSlots[i];
        cudaMemcpyAsync(d_img, img, kImageBytes, cudaMemcpyHostToDevice, s);
        s1<<<1, 1, 0, s>>>(d_img, d_wrkx);
        cudaMemcpyAsync(wrkx, d_wrkx, kImageBytes, cudaMemcpyDeviceToHost, s);
    }

    // Block the host until each stream has drained, in creation order.
    for (int i = 0; i < kStreamCount; ++i) {
        cudaStreamSynchronize(*streamSlots[i]);
    }
}
//---------------------------------------------------------------------------------------
I use the visual profiler nvprof to inspect the results of the execution, and I see that the streams are not running in parallel. Instead, I get serial execution, and I cannot understand why… For every kernel I use just 1 thread.
I am using a Jetson TX1.
It would be very useful for me if you could help me to achieve fully concurrent streams.
Thank you all!