Writing a kernel efficiently with uneven for-loop indexing

I have implemented my multiply+accumulate kernel in the following way:

//int blocksize = 256; //multiple of 32

//int nblocks = ((pix3*npix*npix)+blocksize-1)/blocksize; //round to max where pix3 =400*50, npix = 7

// printf("\nnblocks = %d\n", nblocks);

// xc_corr <<< nblocks,blocksize >>> (ffcorr1, ref_d, pix1, pix2, npix, xcout, n, xc_partial, sum_xc);

__global__ void xc_corr(double* in_im, double* ref_im, int nx, int ny, int npix, double* out1, int n, double* xc_partial, double* sum_xc)

{

	int k = blockIdx.x * blockDim.x + threadIdx.x;

	int i = 0;

	int j = 0;

	int p = 0;

	int q = 0;

	if(k < n) //n = 50

	{

		for(i = 0; i < npix; ++i)

		{

			for(j = 0; j < npix; ++j)

			{

				sum_xc[k]  = 0; //force update

				for(p = 0; p < ny; ++p) //ny = 400

				{

					for(q = 0; q < ny; ++q) //ny = 400

					{					

						xc_partial[(k*ny*ny)+((i*npix+j)*ny*ny)+p*ny+q] = in_im[(k*ny*ny)+p*ny+q] * ref_im[(k*ny*ny)+((i*npix+j)*ny*ny)+p*ny+q]; //array multiplication

						sum_xc[k] = sum_xc[k] + xc_partial[(k*ny*ny)+((i*npix+j)*ny*ny)+p*ny+q]; //reduction

					}

				}

				out1[(k*npix*npix)+i*npix+j] = sum_xc[k];

			}

		}

	}	

}

When I profiled this kernel, it took about 136 ms to execute (98% of the total execution time). I would like to optimize it to reduce the execution time, but I am lost as to where to start.

My thoughts were:

  1. If I converted all the for loops to make use of the threads executing per block, would that be a better alternative?

  2. Would shared memory be a better alternative?

My ptxas output looks like this:

ptxas info : Compiling entry function '_Z7xc_corrPdS_iiiS_iS_S_' for 'sm_13'

ptxas info : Used 15 registers, 64+16 bytes smem, 4 bytes cmem[1]

Any feedback/suggestions welcome.

Thanks :)

I think using either shared memory or automatic variables in place of your current array elements xc_partial and sum_xc would be more efficient.
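
For example, something along these lines (an untested sketch that keeps your signature and launch configuration so nothing else has to change; xc_partial and sum_xc are simply no longer written) accumulates into a register instead of device memory:

__global__ void xc_corr(double* in_im, double* ref_im, int nx, int ny, int npix, double* out1, int n, double* xc_partial, double* sum_xc)
{
	int k = blockIdx.x * blockDim.x + threadIdx.x;
	if(k < n)
	{
		for(int i = 0; i < npix; ++i)
		{
			for(int j = 0; j < npix; ++j)
			{
				double acc = 0.0; //automatic variable, lives in a register rather than device memory
				int off = (k + i*npix + j)*ny*ny; //same offset as (k*ny*ny)+((i*npix+j)*ny*ny)
				for(int p = 0; p < ny; ++p)
					for(int q = 0; q < ny; ++q)
						acc += in_im[(k*ny*ny)+p*ny+q] * ref_im[off+p*ny+q];
				out1[(k*npix*npix)+i*npix+j] = acc; //one global write per output element
			}
		}
	}
}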

I rewrote my kernel this way:

//int blocksize = 16; //multiple of 32

//int nblocks = ((pix3)+blocksize-1)/blocksize; //round to max pix3 = 400, nblocks = 25

//dim3 dimBlock(blocksize,blocksize);//(16,16)

//dim3 dimGrid(nblocks,nblocks);//(25,25)

//xc_corr <<< dimGrid,dimBlock >>> (ffcorr1, ref_d, pix1, pix2, npix, xcout, n, xc_partial, sum_xc);

__global__ void xc_corr(double* in_im, double* ref_im, int nx, int ny, int npix, double* out1, int n, double* xc_partial, double* sum_xc)

{

	int p = blockIdx.x * blockDim.x + threadIdx.x;

	int q = blockIdx.y * blockDim.y + threadIdx.y;

	int k = threadIdx.x;

	int i = threadIdx.x;

	int j = threadIdx.y;

	

	if(k < n) // k =1

	{

		if(i<npix && j<npix) //npix = 7

		{

				sum_xc[k]  = 0; //force update

				if(p<ny && q<ny) //ny = 20

				{			

						xc_partial[(k*ny*ny)+((i*npix+j)*ny*ny)+p*ny+q] = in_im[(k*ny*ny)+p*ny+q] * ref_im[(k*ny*ny)+((i*npix+j)*ny*ny)+p*ny+q];

						sum_xc[k] = sum_xc[k] + xc_partial[(k*ny*ny)+((i*npix+j)*ny*ny)+p*ny+q];

				}

				out1[(k*npix*npix)+i*npix+j] = sum_xc[k];

		 }

	}

}

This reduced the kernel time from 138 ms to about 2 ms and gave correct results for k < 1.

However, when I try to scale this kernel up to k = 50, it gives incorrect results.

The original C code looks like this:

for(k = 0; k < n; ++k)

{

	for(i = 0; i < npix; ++i)

	{

		for(j = 0; j < npix; ++j)

		{

			sum_xc[k]  = 0; //force update

			for(p = 0; p < pix1; ++p)

			{

				for(q = 0; q < pix1; ++q)

				{			

					xc_partial[(k*pix1*pix1)+((i*npix+j)*pix1*pix1)+p*pix1+q] = ffcorr1[(k*pix1*pix1)+p*pix1+q] * ref[(k*pix1*pix1)+((i*npix+j)*pix1*pix1)+p*pix1+q];

					sum_xc[k] = sum_xc[k] + xc_partial[(k*pix1*pix1)+((i*npix+j)*pix1*pix1)+p*pix1+q];

				}

			}

			xcout[(k*npix*npix)+i*npix+j] = sum_xc[k];

		}

	}

}

Any suggestions on what is wrong or how I can rewrite my kernel more efficiently?

suggestions??

Hi

If I am reading it correctly you have

xc_partial[  ((i*npix + j + k)*pix1Squared) +p*pix1 +q] 

		=  ref[  ((i*npix + j + k)*pix1Squared) +p*pix1 +q]  * ffcorr1[( k*pix1Squared) + p*pix1 +q];

So the first two things I see are that you have 5 loop variables (in the C code) and are working with a 4D object.

k and j are both multiplied by pix1Squared, so I combined them for readability.

Is that correct and the way you intended it ?

Could be useful to add a comment showing the code as multidimensional arrays e.g.

// xc_partial[ i, j+k, p, q] = ref_im[ i, j+k, p, q] * ffcorr1[ k, p, q ]

And also a comment as to what the code is supposed to do

NB: for any thread, ffcorr1[(k*pix1*pix1)+p*pix1+q] will always be the same value, as k, p and q only involve threadIdx and blockIdx,

so in the CUDA code you could hold that value in a variable; the compiler may do it for you, but it makes the code easier to read.
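
e.g. something like this inside your kernel (using your variable names):

	double f = in_im[(k*ny*ny)+p*ny+q]; //same value for this thread whatever i and j are
	xc_partial[(k*ny*ny)+((i*npix+j)*ny*ny)+p*ny+q] = f * ref_im[(k*ny*ny)+((i*npix+j)*ny*ny)+p*ny+q];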

With your rewritten code each thread:

reads a single value from each of 4 arrays (expensive),

does one simple calculation,

stores results in three arrays (expensive),

and exits.

The first question is: are all 3 result arrays needed later?

sum_xc is in device memory, and k = threadIdx.x; remember every block has 16 threads for each threadIdx.x value of 0, 1, ... 15,

so in this

sum_xc[k] = sum_xc[k] + ...

sum_xc[k] will actually be updated by a great many threads at the same time, i.e. they overwrite each other's results.

You can fix this by changing the design, and at the same time improve performance with other design changes.
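
As a rough, untested sketch of the kind of design change I mean: give each output element xcout[k*npix*npix + i*npix + j] its own block, let the block's threads cover p and q, and reduce the products in shared memory, so xc_partial and sum_xc disappear entirely. (This assumes pix1*pix1 = 400, which fits in a single block; the kernel name and parameter order here are just for illustration, adjust them to yours.)

//launch: dim3 grid(npix*npix, n); xc_corr_blocks <<< grid, pix1*pix1 >>> (ffcorr1, ref_d, pix1, npix, n, xcout);
__global__ void xc_corr_blocks(double* in_im, double* ref_im, int pix1, int npix, int n, double* out1)
{
	__shared__ double sdata[512]; //padded to a power of two for the reduction (pix1*pix1 = 400 here)

	int ij = blockIdx.x;  //combined i*npix + j
	int k  = blockIdx.y;  //image index
	int pq = threadIdx.x; //combined p*pix1 + q

	double v = 0.0;
	if(pq < pix1*pix1)
		v = in_im[k*pix1*pix1 + pq] * ref_im[(k + ij)*pix1*pix1 + pq]; //same offsets as the C loops

	sdata[threadIdx.x] = v;
	if(threadIdx.x + blockDim.x < 512)
		sdata[threadIdx.x + blockDim.x] = 0.0; //zero the padding slots
	__syncthreads();

	for(int s = 256; s > 0; s >>= 1) //shared-memory tree reduction
	{
		if(threadIdx.x < s)
			sdata[threadIdx.x] += sdata[threadIdx.x + s];
		__syncthreads();
	}

	if(threadIdx.x == 0)
		out1[k*npix*npix + ij] = sdata[0]; //one write per output element
}

That also gives you npix*npix*n = 2450 blocks of independent work rather than only 50 useful threads.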

If you reply with the two comments I suggested it will be easier for people to help you.

Cheers,

@kbam: Thanks for the comments!!!

Description of the C program:

n = 50, npix = 7, pix1 = 20

ffcorr1 is an array of 400*n elements and ref is an array of 400*npix*npix*n elements. xc_partial is an array of 400*npix*npix*n elements, which is then reduced to sum_xc. sum_xc is an array of npix*npix*n elements. This is an implementation of 2D cross-correlation without using FFTs.

So the i and j loops are offsets for accessing the ref array correctly, so that the right ref indices are multiplied with the corresponding indices of the ffcorr array.

So if k = 0:

for the first offset (i = 0, j = 0), the ffcorr index goes from 0 to 399 and the ref index goes from 0 to 399

for the next offset (i = 0, j = 1), the ffcorr index goes from 0 to 399 and the ref index goes from 400 to 799

and for the last offset (i = 6, j = 6), the ffcorr index goes from 0 to 399 and the ref index goes from 19200 to 19599.

At the same time, sum_xc is an array of npix*npix values.

for(k = 0; k < n; ++k)

{

	for(i = 0; i < npix; ++i)

	{

		for(j = 0; j < npix; ++j)

		{

			sum_xc[k]  = 0; //force update

			for(p = 0; p < pix1; ++p)

			{

				for(q = 0; q < pix1; ++q)

				{			

					xc_partial[(k*pix1*pix1)+((i*npix+j)*pix1*pix1)+p*pix1+q] = ffcorr1[(k*pix1*pix1)+p*pix1+q] * ref[(k*pix1*pix1)+((i*npix+j)*pix1*pix1)+p*pix1+q];

					sum_xc[k] = sum_xc[k] + xc_partial[(k*pix1*pix1)+((i*npix+j)*pix1*pix1)+p*pix1+q];

				}

			}

			xcout[(k*npix*npix)+i*npix+j] = sum_xc[k];

		}

	}

}
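
In the multidimensional notation you suggested, I believe the indexing works out to:

// xcout[k, i, j] = sum over p, q of ffcorr1[k, p, q] * ref[k + i*npix + j, p, q]
//
// flat indices: ffcorr1: k*pix1*pix1 + p*pix1 + q
//               ref:     (k + i*npix + j)*pix1*pix1 + p*pix1 + q
//               xcout:   k*npix*npix + i*npix + j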

My first attempt at implementing this kernel in CUDA revolved around just using one index, i.e.:

int blocksize = 16; //multiple of 32

int nblocks = ((pix3)+blocksize-1)/blocksize; //round to max pix3 = 400, nblocks = 25

xc_corr <<< nblocks,blocksize >>> (...)

I profiled this kernel and found the execution time to be 130 ms but the results were correct.

Then I decided to use

dim3 dimBlock(blocksize,blocksize);//(16,16)

dim3 dimGrid(nblocks,nblocks);//(25,25)

But using a 2D grid gives me correct results only when k = 0.

Thanks in advance for suggestions regarding optimizing this kernel :)

suggestions?
