cudaStream performance

Hello,

I made a program which uses CUDA streams to test performance on my Shield tablet.

Here is the code:

#define WIDTH   6400
#define HEIGHT  4800
#define NB_STREAM 10

#define BLOC_X 32
#define BLOC_Y 32

cudaStream_t Stream[NB_STREAM];

cudaArray * Array_PatchsMaxDevice;
texture<u_int8_t, 2,cudaReadModeElementType> Image;

// Copy one horizontal band of the texture `Image` into ptDataOut.
// Expects a 2D launch: one thread per pixel of the band. hoffset is the
// band's first row within the full WIDTH-wide image; widthToProcess /
// heightToProcess bound the band (partial edge blocks are guarded out).
// NOTE(review): despite the name, no Sobel operator is applied here — the
// kernel is a straight texture-to-buffer copy.
__global__ void SobelKernel(u_int8_t *ptDataOut,int hoffset,int widthToProcess,int heightToProcess)
{
    // Global pixel coordinates of this thread within the band.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Bounds guard: the grid is rounded up, so edge blocks overhang the band.
    if (col >= widthToProcess || row >= heightToProcess)
        return;

    // Destination row is shifted down by hoffset; source texel is fetched
    // from the same absolute row of the bound texture.
    const int absRow = row + hoffset;
    ptDataOut[absRow * WIDTH + col] = tex2D(Image, col, absRow);
}


// Split the HEIGHT-row image into NB_STREAM horizontal bands and, per band,
// pipeline H2D copy -> kernel -> D2H copy on a dedicated stream (depth-first
// launch order). Times the whole pipeline with CUDA events.
//
// Fixes vs. original: host buffers are now pinned (cudaHostAlloc) — with
// plain malloc the cudaMemcpyAsync calls are silently synchronous and no
// overlap can occur; all API calls are checked; all resources (streams,
// events, texture binding, device/host allocations) are released.
void processFilter()
{
	u_int8_t *u8_PtImageHost;      // host-side input image (pinned)
	u_int8_t *u8_ptDataOutHost;    // host-side output buffer (pinned)
	u_int8_t *u8_ptDataOutDevice;  // device-side output buffer

	// Page-locked host memory is required for cudaMemcpyAsync to actually
	// run asynchronously and overlap with kernels in other streams.
	checkCudaErrors(cudaHostAlloc((void**)&u8_PtImageHost,  WIDTH*HEIGHT*sizeof(u_int8_t), cudaHostAllocDefault));
	checkCudaErrors(cudaHostAlloc((void**)&u8_ptDataOutHost, WIDTH*HEIGHT*sizeof(u_int8_t), cudaHostAllocDefault));

	checkCudaErrors(cudaMalloc((void**)&u8_ptDataOutDevice,WIDTH*HEIGHT*sizeof(u_int8_t)));
	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned char>();
	checkCudaErrors(cudaMallocArray(&Array_PatchsMaxDevice, &channelDesc,WIDTH,HEIGHT ));
	checkCudaErrors(cudaBindTextureToArray(Image,Array_PatchsMaxDevice));

	//ClearKernel<<<blocks,threads>>>(u8_ptDataOutDevice,WIDTH,HEIGHT);

	// Rows per band (assumes HEIGHT is divisible by NB_STREAM; the last band
	// is clamped below via min()).
	int blockh = HEIGHT/NB_STREAM;

	// Device selection is per host thread — no need to repeat it per stream.
	checkCudaErrors(cudaSetDevice(0));
	for(int i=0;i<NB_STREAM;i++)
	{
		checkCudaErrors(cudaStreamCreate(&Stream[i]));
	}

	cudaEvent_t Start;
	cudaEvent_t Stop;
	checkCudaErrors(cudaEventCreate(&Start));
	checkCudaErrors(cudaEventCreate(&Stop));

	checkCudaErrors(cudaEventRecord(Start, 0));

	for(int i=0;i<NB_STREAM;i++)
	{
		if(i == 0)
		{
			int localHEIGHT  = blockh;
			// Upload band 0 (rows [0, blockh)) into the texture array.
			checkCudaErrors(cudaMemcpy2DToArrayAsync( Array_PatchsMaxDevice,
					0,
					0,
					u8_PtImageHost,
					WIDTH,
					WIDTH,
					blockh,
					cudaMemcpyHostToDevice  ,
					Stream[i]));

			dim3 threads(BLOC_X,BLOC_Y);
			dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)localHEIGHT/BLOC_Y));
			// NOTE(review): first band intentionally processes only
			// localHEIGHT-1 rows (preserved from the original offset scheme);
			// the next band starts at row blockh-1 — verify this boundary.
			SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,0,WIDTH,localHEIGHT-1);
			checkCudaErrors(cudaGetLastError());

			checkCudaErrors(cudaMemcpyAsync(u8_ptDataOutHost,u8_ptDataOutDevice,WIDTH*(localHEIGHT-1)*sizeof(u_int8_t),cudaMemcpyDeviceToHost,Stream[i]));
		}else{
			// Byte offset of one band in the linear host image.
			int ioffsetImage =  WIDTH*(HEIGHT/NB_STREAM  );
			// Destination row of this band in the texture array.
			int hoffset = HEIGHT/NB_STREAM *i;
			// Output row offset: shifted back one row to follow band 0's
			// localHEIGHT-1 rows (see note above).
			int hoffsetkernel = HEIGHT/NB_STREAM -1 + HEIGHT/NB_STREAM* (i-1);
			// Clamp the last band so we never read/write past HEIGHT.
			int localHEIGHT  = min(HEIGHT - (blockh*i),blockh);

			checkCudaErrors(cudaMemcpy2DToArrayAsync( Array_PatchsMaxDevice,
					0,
					hoffset,
					&u8_PtImageHost[ioffsetImage*i],
					WIDTH,
					WIDTH,
					localHEIGHT,
					cudaMemcpyHostToDevice  ,
					Stream[i]));

			dim3 threads(BLOC_X,BLOC_Y);
			dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)localHEIGHT/BLOC_Y));

			SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,hoffsetkernel,WIDTH,localHEIGHT);
			checkCudaErrors(cudaGetLastError());
			checkCudaErrors(cudaMemcpyAsync(&u8_ptDataOutHost[hoffsetkernel*WIDTH],&u8_ptDataOutDevice[hoffsetkernel*WIDTH],WIDTH*localHEIGHT*sizeof(u_int8_t),cudaMemcpyDeviceToHost,Stream[i]));

			// Stop early once the shifted bands have covered the image.
			if(HEIGHT - (blockh +1 +blockh*(i-1))<=0)
			{
				break;
			}
		}
	}

	// Wait for every band's copy/kernel/copy pipeline to drain.
	for(int i=0;i<NB_STREAM;i++)
	{
		checkCudaErrors(cudaStreamSynchronize(Stream[i]));
	}

	checkCudaErrors(cudaEventRecord(Stop, 0));
	checkCudaErrors(cudaEventSynchronize(Stop));

	float dt_ms;
	checkCudaErrors(cudaEventElapsedTime(&dt_ms, Start, Stop));

	printf("dt_ms %f \n",dt_ms);

	LOGD("dt_ms %f \n",dt_ms);

	// Release everything created above — the original leaked all of it.
	checkCudaErrors(cudaUnbindTexture(Image));
	for(int i=0;i<NB_STREAM;i++)
	{
		checkCudaErrors(cudaStreamDestroy(Stream[i]));
	}
	checkCudaErrors(cudaEventDestroy(Start));
	checkCudaErrors(cudaEventDestroy(Stop));
	checkCudaErrors(cudaFreeArray(Array_PatchsMaxDevice));
	checkCudaErrors(cudaFree(u8_ptDataOutDevice));
	checkCudaErrors(cudaFreeHost(u8_PtImageHost));
	checkCudaErrors(cudaFreeHost(u8_ptDataOutHost));
}

The problem is that there is no difference between using 1 stream or 10 streams; the program takes approximately 120 ms either way.

Is there a problem in my code?
Shouldn't it be faster with 10 streams? Should I expect a big difference?

I used NSight and I get a strange result.

Here is the profile of my program:

I don’t understand why the streams are waiting for each other.

Because each kernel launch fills the GPU, so there is no possibility for concurrency.

		dim3 threads(BLOC_X,BLOC_Y);
		dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)localHEIGHT/BLOC_Y));

Thank you for the answer.
So I change to dim3 blocks(1,1) and threads(BLOC_X/8,BLOC_Y/8) but it still the same.

Now your kernels take so little time that there is no scheduling opportunity. The kernel launch overhead is a few microseconds, and your kernels are taking less than that to execute/complete.

Concurrent kernels is fairly hard to witness in practice.

Run and study the cuda sample concurrent kernels.

I made some modifications, most of them concerning the size of the processed data.

In fact there is no problem; it is just the normal execution of the program. I simply had so little data that the gaps in the execution were clearly visible.
When I process more data, the latency between two executions becomes a smaller percentage of the total time.

So what is the minimum size of the processed data needed to see a significant difference between processing with and without streams?

In my application, the Sobel filter is applied to a 640x480 image, and there is no difference between processing with and without streams.

I took a closer look at your code. Any time you are using cudaMemcpyAsync, you will most likely not get the expected behavior if your host memory allocations are not pinned. Therefore you should change this:

u8_PtImageHost	 = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
u8_ptDataOutHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));

to this:

checkCudaErrors(cudaHostAlloc(&u8_PtImageHost, WIDTH*HEIGHT*sizeof(u_int8_t), cudaHostAllocDefault));
checkCudaErrors(cudaHostAlloc(&u8_ptDataOutHost, WIDTH*HEIGHT*sizeof(u_int8_t), cudaHostAllocDefault));

When I made that change to your code, my execution time dropped from 26ms to 6ms. This still doesn’t address all the question about kernel concurrency, but since you are doing a depth-first launch:

checkCudaErrors(cudaMemcpy2DToArrayAsync( Array_PatchsMaxDevice,
					0,
					0,
					u8_PtImageHost,
					WIDTH,
					WIDTH,
					blockh,
					cudaMemcpyHostToDevice  ,
					Stream[i]));

			dim3 threads(BLOC_X,BLOC_Y);
			dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)localHEIGHT/BLOC_Y));
			SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,0,WIDTH,localHEIGHT-1);
			checkCudaErrors(cudaGetLastError());

			checkCudaErrors(cudaMemcpyAsync(u8_ptDataOutHost,u8_ptDataOutDevice,WIDTH*(localHEIGHT-1)*sizeof(u_int8_t),cudaMemcpyDeviceToHost,Stream[i]));

your cudaMemcpyAsync operations were actually blocking (since the host allocations referenced were not pinned) and so you could not possibly witness overlap, of anything.

This necessity is covered in the programming guide under asynchronous concurrent execution.

Thank you for the answer.